This file visualises the distribution of effect sizes across the moderators, and then conducts moderation tests using univariate analyses, meta-regression and meta-CART approaches.
# For reproducibility package versions are locked to a specific date
# (groundhog installs package versions as they existed on CRAN on that date)
if (!require(groundhog)) install.packages('groundhog')
# NOTE(review): "cli" appears twice in this vector - harmless, but could be deduplicated
groundhog::groundhog.library(c("readxl", "metaforest", "metafor", "tidyverse", "clubSandwich", "cli", "rsprite2", "esc", "ggtext",
"mice", "mitools", "metacart", "gt", "gtExtras", "psych", "furrr", "progressr", "micemd", "boot",
"cli"), date = "2023-07-09")
# GitHub package (user/repo form)
groundhog::groundhog.library(c("lukaswallrich/timesaveR"), date = "2023-07-09")
# Need more recent version of patchwork due to faceting bug
groundhog::groundhog.library(c("patchwork"), date = "2023-07-09")
source("helpers/metacart_fix.R") # metacart broke with R 4.2.0 due to a change in if-statement evaluation. This fix changes `class ==` to `inherits()`
source("helpers/metacart_plot.R")
source("helpers/helpers.R")
source("helpers/equivalence_testing.R")
# Prepare to display progress (progressr handlers, rendered via cli)
handlers(
handler_cli()
)
options(cli.progress_show_after = 0)
options(cli.progress_clear = FALSE)
# Full coded dataset of effect sizes (one row per effect size)
dataset <- read_rds("data/full_dataset.rds")
# Assumed correlation between dependent effect sizes
# (used when imputing covariance matrices within clusters below)
rho <- 0.6
# Get confidence intervals based on the adjusted standard errors (for
# effective sample size and unreliability), using Fisher's r-to-z transform.
#
# @param r_adj Adjusted correlation coefficient.
# @param adjustment_ratio Factor by which the usual Fisher-z SE (1/sqrt(n-3))
#   is multiplied to reflect the adjustment applied to r.
# @param n Sample size (here: number of teams).
# @param conf.level Confidence level; defaults to 0.95.
# @return A list with elements `ci_low` and `ci_high`, both on the r scale.
get_cis <- function(r_adj, adjustment_ratio, n, conf.level = 0.95) {
  # Clamp extreme estimates to a value that rounds to 1: the Fisher transform
  # is undefined at |r| >= 1 (log of a non-positive number). Using >= / <=
  # rather than exact equality also catches disattenuated correlations that
  # exceed +/-1 after the unreliability adjustment, which the original exact
  # comparison (== 1) would have passed through, yielding NaN.
  if (r_adj >= 1) r_adj <- .995
  if (r_adj <= -1) r_adj <- -.995
  # Fisher's r to z transformation
  z <- 0.5 * log((1 + r_adj) / (1 - r_adj))
  alpha <- 1 - conf.level
  z.critical <- qnorm(1 - alpha / 2)
  # SE on the z scale, inflated by the adjustment applied to r
  se_adj <- (1 / sqrt(n - 3)) * adjustment_ratio
  # Confidence interval calculation (z scale)
  ci.lower <- z - z.critical * se_adj
  ci.upper <- z + z.critical * se_adj
  # Transforming back to r (inverse Fisher transform)
  ci.lower.r <- (exp(2 * ci.lower) - 1) / (exp(2 * ci.lower) + 1)
  ci.upper.r <- (exp(2 * ci.upper) - 1) / (exp(2 * ci.upper) + 1)
  list(ci_low = ci.lower.r, ci_high = ci.upper.r)
}
# Per-effect-size CIs and plotting metadata for the specification-curve plot.
# The adjustment ratio is r_adj / r_rep (adjusted over reported r); when
# r_adj is 0 that ratio is undefined, so fall back to 1 (no inflation).
dataset_spec <- dataset %>%
mutate(ci = pmap(list(r_adj, ifelse(r_adj == 0, 1, r_adj/r_rep), n_teams), get_cis)) %>%
# Unpack the list of ci_low/ci_high pairs into two numeric columns
mutate(ci_low = map_dbl(ci, 1), ci_high = map_dbl(ci, 2)) %>%
# Rank effect sizes so the x-axis shows them ordered by magnitude
select(-ci) %>% arrange(r_adj) %>% rowid_to_column(var = "specification") %>%
# Colour by sign/significance: red = CI entirely below 0, green = entirely
# above 0, grey = CI spans 0
mutate(color = case_when(
ci_high < 0 ~ "darkred",
ci_low <= 0 ~ "darkgrey",
ci_low > 0 ~ "darkgreen"
))
# Simplified code for creating the specification curve-type plot
# Derived from https://github.com/masurp/specr
# Top panel: each effect size with its CI, ranked by adjusted r
plot_a <- dataset_spec %>%
ggplot(aes(x = specification, y = r_adj, ymin = ci_low, ymax = ci_high)) +
geom_pointrange(aes(color = color), alpha = 0.5, size = 0.6, fatten = 1) +
theme_minimal() +
# Colours are literal colour names set in dataset_spec$color
scale_color_identity() +
theme(axis.line = element_line(linewidth = 0.5), legend.position = "none",
panel.spacing = unit(0.75, "lines"), axis.text = element_text(color = "black")) +
labs(x = "", y = "r (adjusted)") +
# Reference line at zero effect
geom_hline(yintercept = 0)
# Moderators shown in the specification plot: display label -> column name
choices <- c(
  "Domain" = "domain", "Top team" = "tmt", "Students" = "stud_sample",
  "Interdependence" = "interdep", "Complexity" = "complexity", "Longevity" = "longevity",
  "Measure" = "meas_type", "Rater" = "rater", "Hypothesis" = "art_focus"
)
# Reverse lookup (column name -> display label), used to relabel facet keys
rev_choices <- setNames(names(choices), choices)
# Recoding of raw moderator levels to display labels. Named character
# vector: names are the coded values, values the labels shown in the plot;
# the order of entries defines the factor-level order used below.
levels_replace <- c(
  "focal H" = "Focal hypothesis",
  "auxiliary H" = "Auxiliary hypothesis",
  "descriptive" = "Descriptive",
  "high" = "High",
  "medium" = "Medium",
  "low" = "Low",
  "yes" = "Yes",
  "no" = "No",
  "Variety" = "Variety",
  "Separation" = "Separation",
  "Other" = "Other",
  "Objective" = "Objective",
  "Subjective - self" = "Subjective - Self",
  "Subjective - supervisor" = "Subjective - Supervisor",
  "Subjective - external" = "Subjective - External",
  "hours" = "Hours",
  "days" = "Days",
  "weeks" = "Weeks",
  "months" = "Months",
  "years" = "Years",
  "stable" = "Stable",
  "Demographic" = "Demographic",
  "Cognitive" = "Cognitive",
  "Job-related" = "Job-Related"
)
# Bottom panel: tick marks showing which moderator level each ranked effect
# size belongs to, one facet per moderator.
plot_b <- dataset_spec %>%
# Keep only the 5 most common sub-domains, lump the rest into "Other"
mutate(sub_dom = fct_lump_n(sub_dom, 5)) %>%
# Long format: one row per effect size x moderator
tidyr::gather(key = "key", value = "value", all_of(choices)) %>%
# Relabel moderator names and levels for display; coalesce() keeps the
# original string where no replacement is defined in the lookup vectors
mutate(key = coalesce(rev_choices[key], key),
value = coalesce(levels_replace[value], value),
key = factor(key, levels = rev_choices),
value = factor(value, levels = levels_replace) %>% fct_rev()) %>%
filter(!is.na(value)) %>%
ggplot(aes(x = specification, y = value, color = color)) +
# shape 124 is a vertical bar ("|") - one tick per effect size
geom_point(shape = 124, size = 3.35, alpha = .5) +
scale_color_identity() +
theme_minimal() +
facet_grid(key ~ ., scales = "free_y", space = "free_y", switch = "y") +
theme(axis.line = element_line(linewidth = 0.5),
legend.position = "none",
panel.spacing = unit(0.75, "lines"),
axis.text = element_text(color = "black"),
strip.text.y.left = element_text(angle = 90),
strip.background = element_rect(fill="lightblue", colour="black",linewidth = 1),
strip.placement = "outside",
strip.text.x = element_blank()) +
labs(x = "Effect sizes (ranked)", y = "")
# Combine both plots
# Stack the two panels (patchwork), top panel a quarter of the height
p <- plot_a / plot_b +
plot_layout(heights = c(1, 4))
ggsave("figures/distribution_of_effects.png", p, width = 23, height = 31, units = "cm")
p
For interpretability and comparability with earlier work, we started with univariate tests using the observed/coded data (which naturally results in a different dataset for each moderator, since observations with missing values on that moderator are dropped).
# Pre-registered categorical moderators (column names in `dataset`)
cat_moderators <- c("complexity", "interdep", "longevity", "meas_type", "rater", "design", "art_focus", "criterion")
# Pre-registered continuous moderators
cont_moderators <- c("year_merged", "collectivism", "power_distance")
# Exploratory (non-registered) categorical moderators; some are derived below
expl_mods <- c("country_grp", "tmt", "stud_sample", "ind_sector", "team_function", "domain_and_sub", "objective_rater")
# To rename for reporting
# Lookup table mapping moderator column names (old) to display names (new);
# used throughout via var_names$new[var_names$old == moderator]
var_names <- tibble::tribble(
~old, ~new,
"art_focus", "Article focus",
"pub_status", "Publication status",
"language", "Language",
"design", "Design",
"tmt", "TMT",
"stud_sample", "Student sample",
"meas_type", "Diversity measure",
"rater", "Performance rater",
"objective_rater", "Objective rater",
"criterion", "Performance criterion",
"interdep", "Interdependence",
"complexity", "Complexity",
"virtuality", "Virtuality",
"longevity", "Longevity",
"auth_diff", "Authority Differentiation",
"collectivism", "Collectivism",
"power_distance", "Power distance",
"ind_sector", "Industry sector",
"team_function", "Team function",
"country_grp", "Country",
"year_merged", "Year of data collection",
"domain_and_sub", "Sub-domain"
)
# Per-moderator lookup mapping coded levels (level_old) to display labels
# (level_new); the row order per variable defines the display order of
# levels in the coefficient plots below (see cat_plots())
level_names <- tibble::tribble(
~var, ~level_old, ~level_new,
"art_focus", "focal H", "Focal hypothesis",
"art_focus", "auxiliary H", "Auxiliary hypothesis",
"art_focus", "descriptive", "Descriptive",
"pub_status", "Published", "Published",
"pub_status", "Masters Dissertation", "Masters Dissertation",
"pub_status", "Working paper/Preprint", "Working Paper/Preprint",
"pub_status", "Conference presentation", "Conference Presentation",
"pub_status", "PhD Dissertation", "PhD Dissertation",
"interdep", "high", "High",
"interdep", "medium", "Medium",
"interdep", "low", "Low",
"complexity", "high", "High",
"complexity", "medium", "Medium",
"complexity", "low", "Low",
"tmt", "yes", "Yes",
"tmt", "no", "No",
"stud_sample", "yes", "Yes",
"stud_sample", "no", "No",
"meas_type", "Variety", "Variety",
"meas_type", "Separation", "Separation",
"meas_type", "Other", "Other",
"design", "Experimental", "Experimental",
"design", "Observational", "Observational",
"objective_rater", "Objective", "Objective",
"objective_rater", "Subjective", "Subjective",
"rater", "Objective", "Objective",
"rater", "Subjective - self", "Subjective - Self",
"rater", "Subjective - supervisor", "Subjective - Supervisor",
"rater", "Subjective - external", "Subjective - External",
"virtuality", "physical", "Physical",
"virtuality", "hybrid-members", "Hybrid-Members",
"virtuality", "virtual", "Virtual",
"auth_diff", "high", "High",
"auth_diff", "mixed", "Mixed",
"auth_diff", "low", "Low",
"language", "chinese", "Chinese",
"language", "dutch", "Dutch",
"language", "english", "English",
"language", "french", "French",
"language", "german", "German",
"language", "indonesian", "Indonesian",
"language", "italian", "Italian",
"language", "japanese", "Japanese",
"language", "korean", "Korean",
"language", "portuguese", "Portuguese",
"language", "spanish", "Spanish",
"longevity", "hours", "Hours",
"longevity", "days", "Days",
"longevity", "weeks", "Weeks",
"longevity", "months", "Months",
"longevity", "years", "Years",
"longevity", "stable", "Stable"
)
# Univariate moderation test for one categorical moderator.
#
# Fits two multilevel meta-regressions (metafor::rma.mv) with effect sizes
# nested in articlestudy clusters: (1) cell means per moderator level across
# domains, and (2) cell means per domain x moderator level. Cluster-robust
# (CR2) coefficient tests and Wald tests of level equality are computed with
# clubSandwich.
#
# @param moderator Name (string) of the moderator column in `data`.
# @param data Dataset with r_adj, var_adj, articlestudy, effect_id, domain.
# @param min_group_k If a number, moderator levels (and domain x level
#   combinations) with fewer than this many effect sizes are excluded;
#   FALSE disables the filtering.
# @return List with the Wald tests (moderator_test), robust coefficient
#   tables (robu_averages, robu_domain_no_int) and the fitted models.
single_cat_moderator_test <- function(moderator, data, min_group_k = FALSE) {
# 7 progress ticks: V matrices, two models, overall Wald test, 3 domain tests
p <- progressor(7, message = paste("Now estimating", moderator))
if (min_group_k) {
data$`__mod__` <- data[[moderator]] # So that colname is known for use in dplyr pipe
# Moderator levels with fewer than min_group_k effect sizes overall
outs <- data %>%
count(`__mod__`) %>%
filter(n < min_group_k) %>%
pull(`__mod__`)
# Domain x level combinations below the threshold (a level that is rare
# overall is necessarily rare in every domain, so the domain filter is
# at least as strict as the overall filter)
data$`__mod_dom__` <- paste0(data$`__mod__`, data$domain)
domain_outs <- data %>%
count(`__mod_dom__`) %>%
filter(n < min_group_k) %>%
pull(`__mod_dom__`)
# Covariance matrix for the across-domain model, built on the rows that
# survive the overall-level filter (must align with `data` after L237)
V <- with(
data %>% filter(!data[[moderator]] %in% outs),
impute_covariance_matrix(
vi = var_adj,
cluster = articlestudy,
r = rho
)
)
if (nrow(V) != nrow(data)) message("Excluded ", nrow(data) - nrow(V), " effect sizes with moderator levels below minimum group size")
# Covariance matrix for the domain model, built on the rows that survive
# the domain-combination filter (aligned with `data` after the second
# filter inside the model block below)
Vd <- with(
data %>% filter(!data$`__mod_dom__` %in% domain_outs),
impute_covariance_matrix(
vi = var_adj,
cluster = articlestudy,
r = rho
)
)
if (nrow(Vd) != nrow(data)) message("Excluded ", nrow(data) - nrow(Vd), " effect sizes with moderator-domain combinations below minimum group size")
data <- data %>% filter(!data[[moderator]] %in% outs)
} else {
# No filtering: one covariance matrix serves both models
V <- with(
data,
impute_covariance_matrix(
vi = var_adj,
cluster = articlestudy,
r = rho
)
)
Vd <- V # Same matrix when there is no filtering
}
p()
# Number of (non-NA) moderator levels remaining after filtering
no_levels <- data[[moderator]] %>%
unique() %>%
na.omit() %>%
length()
mod <- {
# Unordered factor so rma.mv uses treatment-style cell-mean coding
data[[moderator]] <- factor(data[[moderator]], ordered = FALSE)
# Model 1: cell means per moderator level (no intercept), across domains
mod <- rma.mv(as.formula(paste0("r_adj ~ 0 + ", moderator)),
V = V,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
data = data,
sparse = TRUE
)
p()
# Apply the stricter domain-combination filter so data aligns with Vd
if (min_group_k) data <- data %>% filter(!data$`__mod_dom__` %in% domain_outs)
# Model 2: cell means per domain x moderator level (no intercept)
mod_domain_no_int <- rma.mv(as.formula(paste0("r_adj ~ 0 + domain:", moderator)),
V = Vd,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
data = data,
sparse = TRUE
)
p()
list(
mod = mod,
mod_domains_no_int = mod_domain_no_int
)
}
# Cluster-robust (CR2) coefficient tests for both models
robust_across <- coef_test(mod$mod, vcov = "CR2")
robust_domain_no_int <- coef_test(mod$mod_domains_no_int, vcov = "CR2")
if (nrow(robust_across) > 20) message("Moderator has large number of levels - wald_test may take several minutes.")
# Test of moderator overall
# (Wald test that all level means are equal, i.e. moderation across levels)
res_tibble <- Wald_test(mod$mod, vcov = "CR2", constrain_equal(1:nrow(robust_across), coef(mod$mod))) %>% mutate(domain = "Overall")
p()
# Test of domains
if (no_levels * 3 == nrow(robust_domain_no_int)) {
# Balanced case: every level exists in all 3 domains, so coefficients are
# interleaved by domain and can be selected with a stride of 3
for (i in 1:3) {
# NOTE(review): "[A-z]" also matches some punctuation between Z and a in
# ASCII; here it effectively extracts the leading letters of the domain
# name, so "Job-related" is truncated to "Job" (recoded downstream)
res_tibble <- bind_rows(res_tibble, Wald_test(mod$mod_domains_no_int, vcov = "CR2", constrain_equal(seq(i, no_levels * 3, by = 3), coef(mod$mod_domains_no_int))) %>%
mutate(domain = names(coef(mod$mod_domains_no_int))[i] %>% str_extract("[A-z]*")))
p()
}
} else {
# Some levels did not exist in all domains - in these cases, need to match by coefficient names
for (d in unique(data$domain)) {
res_tibble <- bind_rows(res_tibble, Wald_test(mod$mod_domains_no_int,
vcov = "CR2",
constrain_equal(which(str_detect(names(coef(mod$mod_domains_no_int)), d)), coef(mod$mod_domains_no_int))
) %>%
mutate(domain = d))
p()
}
}
list(moderator_test = res_tibble, robu_averages = robust_across, robu_domain_no_int = robust_domain_no_int, meta_models = mod)
}
# Univariate moderation test for one continuous moderator.
#
# Fits three multilevel meta-regressions (metafor::rma.mv) with effect sizes
# nested in articlestudy clusters: the moderator's overall slope, per-domain
# intercepts plus per-domain slopes, and per-domain slopes without separate
# intercepts. Coefficient tests use cluster-robust (CR2) standard errors.
#
# @param moderator Name (string) of the moderator column in `data`.
# @param data Dataset with r_adj, var_adj, articlestudy, effect_id, domain.
# @return List of robust coefficient tables and the fitted models.
single_cont_moderator_test <- function(moderator, data) {
  p <- progressor(4, message = paste("Now estimating", moderator))
  # Covariance matrix for dependent effect sizes within articlestudy clusters
  V <- with(
    data,
    impute_covariance_matrix(
      vi = var_adj,
      cluster = articlestudy,
      r = rho
    )
  )
  p()
  # All three models share V, the random-effects structure and the data;
  # only the fixed-effects formula differs
  fit_model <- function(formula_str) {
    rma.mv(as.formula(formula_str),
      V = V,
      random = ~ 1 | articlestudy / effect_id,
      test = "t",
      dfs = "contain",
      data = data,
      sparse = TRUE
    )
  }
  # Overall slope of the moderator
  overall <- fit_model(paste0("r_adj ~ ", moderator))
  p()
  # Per-domain intercepts and per-domain slopes
  by_domain <- fit_model(paste0("r_adj ~ 0 + domain + domain:", moderator))
  p()
  # Per-domain slopes only (no separate intercepts)
  by_domain_no_int <- fit_model(paste0("r_adj ~ 0 + domain:", moderator))
  p()
  models <- list(
    mod = overall,
    mod_domains = by_domain,
    mod_domains_no_int = by_domain_no_int
  )
  list(
    robust_across = coef_test(models$mod, vcov = "CR2"),
    robust_domain = coef_test(models$mod_domains, vcov = "CR2"),
    robust_domain_no_int = coef_test(models$mod_domains_no_int, vcov = "CR2"),
    meta_models = models
  )
}
# Build a coefficient plot (and tidy coefficient table) for each categorical
# moderator result produced by single_cat_moderator_test().
#
# @param cats Named list of results from single_cat_moderator_test();
#   names must be the moderator column names.
# @param minimum_articles If not NULL, coefficients based on fewer than this
#   many articles (overall or per domain) are dropped from the plot.
# @param data Dataset used to count articles per level (defaults to the
#   global `dataset`).
# @return Named list with, per moderator, a ggplot (`plot`) and the
#   preprocessed coefficient tibble (`coefs`).
cat_plots <- function(cats, minimum_articles = NULL, data = dataset) {
minart <- minimum_articles # Workaround a promise evaluation error
imap(cats, \(res, predictor_name, minimum_articles = minart) {
# Stack per-domain and across-domain robust coefficient tables
all_coefs <- bind_rows(
res$robu_domain_no_int %>% tibble(),
res$robu_averages %>% tibble()
)
if (!is.null(minimum_articles)) {
# One row per articlestudy x level, to count articles (not effect sizes)
dataset_articles <- data %>%
group_by(articlestudy, !!sym(predictor_name)) %>%
slice_head(n = 1) %>%
ungroup()
all_vals <- dataset_articles[[predictor_name]] %>%
unique() %>%
na.omit()
# Coefficient names (e.g. "complexityhigh") of levels below the
# article threshold overall
out_overall <- setdiff(
all_vals,
dataset_articles[[predictor_name]] %>% table() %>% as_tibble() %>% filter(n >= minimum_articles) %>% pull(1)
) %>% paste0(predictor_name, .)
# Recount per articlestudy x domain x level for the domain thresholds
dataset_articles <- data %>%
group_by(articlestudy, domain, !!sym(predictor_name)) %>%
slice_head(n = 1) %>%
ungroup()
# Coefficient names to drop per domain (prefix must match rma.mv's
# coefficient naming, e.g. "domainDemographic:complexityhigh")
out_dem <- setdiff(
all_vals,
dataset_articles %>% filter(domain == "Demographic") %>% .[[predictor_name]] %>% table() %>%
as_tibble() %>% filter(n >= minimum_articles) %>% pull(1)
) %>% paste0("domainDemographic:", predictor_name, .)
out_job <- setdiff(
all_vals,
dataset_articles %>% filter(domain == "Job-related") %>% .[[predictor_name]] %>% table() %>%
as_tibble() %>% filter(n >= minimum_articles) %>% pull(1)
) %>% paste0("domainJob-related:", predictor_name, .)
out_cog <- setdiff(
all_vals,
dataset_articles %>% filter(domain == "Cognitive") %>% .[[predictor_name]] %>% table() %>%
as_tibble() %>% filter(n >= minimum_articles) %>% pull(1)
) %>% paste0("domainCognitive:", predictor_name, .)
all_out <- c(out_overall, out_dem, out_job, out_cog)
all_coefs <- all_coefs %>% filter(!Coef %in% all_out)
}
cur_levels <- level_names %>% filter(var == predictor_name)
# Split coefficient names into domain ("**Overall**" for the across-domain
# model) and the bare level name
df_preprocessed <- all_coefs %>%
mutate(
domain = coalesce(str_extract(Coef, "domain[^:]+") %>% str_remove("domain"), "**Overall**"),
Coef = str_remove(Coef, paste0(".*", predictor_name))
) %>%
select(coef = Coef, beta = beta, se = SE, df = df_Satt, p = p_Satt, domain)
if (nrow(cur_levels) > 0) {
# Relabel levels and fix their display order (reversed so the first
# level appears at the top of the horizontal plot)
df_preprocessed <- df_preprocessed %>%
rowwise() %>%
mutate(coef = cur_levels$level_new[cur_levels$level_old == coef] %>% factor(levels = rev(cur_levels$level_new))) %>%
ungroup()
}
# Forest-style plot: point estimate with 95% normal-approximation CI,
# faceted by domain; dotted lines mark the small-effect threshold |r| = .1
p <- ggplot(df_preprocessed, aes(y = coef, x = beta, color = domain)) +
geom_point() +
geom_errorbarh(aes(xmin = beta - 1.96 * se, xmax = beta + 1.96 * se), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", linewidth = .5) +
geom_vline(xintercept = c(-.1, .1), linetype = "dotted", linewidth = .5) +
facet_wrap(~domain, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(
x = "", y = "",
subtitle = var_names$new[var_names$old == predictor_name]
) +
theme(
axis.text.y = element_text(angle = 0, hjust = 1),
strip.text = element_markdown()
) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "none")
list(plot = p, coefs = df_preprocessed)
})
}
# Bootstrap (pseudo) R2 confidence intervals based on guidance by Wolfgang Viechtbauer
# https://www.metafor-project.org/doku.php/tips:ci_for_r2
# https://gist.github.com/wviechtb/6fbfca40483cb9744384ab4572639169
# Showed that for this, BCa CIs are best
#
# Statistic function for boot::boot(): refits a null and a moderated
# multilevel meta-regression on the resampled rows and returns the
# percentage of heterogeneity explained by the moderator (0-100), or NA if
# either model fails to converge.
#
# @param dat Full dataset (boot passes the original data here).
# @param indices Row indices of the bootstrap resample (supplied by boot).
#   NOTE(review): this resamples individual effect sizes, not articlestudy
#   clusters - presumably intentional given the imputed covariance matrix
#   is rebuilt per resample, but confirm against the AWS script.
# @param moderator Moderator column name to test.
# @param other_moderators Optional further moderators kept in the null model.
# @param progress Whether to tick a progressor.
#   NOTE(review): `p` is not defined in this function - it must exist as a
#   function (a progressr progressor) in the calling environment whenever
#   progress = TRUE; the exploratory job below passes progress = FALSE.
boot_R2 <- function(dat, indices, moderator, other_moderators = NULL, progress = TRUE) {
if (progress) p()
data <- dat[indices, ]
# Drop rows where moderator or any of other_moderators are NA
if (!is.null(other_moderators)) {
data <- data %>% drop_na(all_of(c(moderator, other_moderators)))
} else {
data <- data %>% drop_na(all_of(moderator))
}
# Null models for R2
# Rebuild the covariance matrix on the resampled, NA-filtered rows
V <- with(
data,
impute_covariance_matrix(
vi = var_adj,
cluster = articlestudy,
r = rho
)
)
# Null model controls for other_moderators if given, else intercept only
if (!is.null(other_moderators)) {
fml <- paste("r_adj ~", paste(other_moderators, collapse = " + "))
} else {
fml <- "r_adj ~ 1"
}
# try() + suppressWarnings(): individual resamples may fail to converge;
# such replicates are returned as NA rather than aborting the bootstrap
mod_null <- try(suppressWarnings(rma.mv(as.formula(fml),
V = V,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
data = data,
sparse = TRUE
)))
mod_moderated <- try(suppressWarnings(rma.mv(as.formula(paste(fml, moderator, sep = " + ")),
V = V,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
data = data,
sparse = TRUE
)))
if (inherits(mod_null, "try-error") || inherits(mod_moderated, "try-error")) {
NA
} else {
if (sum(mod_null$sigma2) > 0) {
# Pseudo-R2 in percent: proportional reduction in summed variance
# components, truncated at 0 (moderator can "increase" variance)
max(0, 100 * (sum(mod_null$sigma2) - sum(mod_moderated$sigma2)) / sum(mod_null$sigma2))
} else {
# catch cases where sum(res0$sigma2) is 0, to avoid -Inf (if sum(res1$sigma2)
# is positive) or NaN (if sum(res1$sigma2) is also 0) for R^2; for such cases,
# define R^2 to be 0
0
}
}
}
# Derived moderator: collapse the four rater categories into objective vs.
# subjective (all three subjective sources combined)
dataset$objective_rater <- fct_collapse(dataset$rater, "Objective" = "Objective",
"Subjective" = c("Subjective - self", "Subjective - supervisor", "Subjective - external"))
# Cache-or-compute: the moderator tests are slow, so load from disk if a
# cached file exists, otherwise estimate and save
if (file.exists("data/single_cat_moderator_tests.qs")) {
single_cat_moderator_tests <- qs::qread("data/single_cat_moderator_tests.qs")
# NOTE(review): != is elementwise and recycles if lengths differ;
# !identical() or !setequal() would be a more robust staleness check
if (any(names(single_cat_moderator_tests) != cat_moderators)) {
warning("File on disk does not match current list of moderators. Consider deleting it to re-estimate.")
}
} else {
single_cat_moderator_tests <- with_progress({
cat_moderators %>%
set_names() %>%
map(\(moderator) single_cat_moderator_test(moderator, dataset))
})
qs::qsave(single_cat_moderator_tests, "data/single_cat_moderator_tests.qs")
}
# Cache-or-compute for the continuous moderator tests (same pattern as the
# categorical tests above)
if (file.exists("data/single_cont_moderator_tests.rds")) {
single_cont_moderator_tests <- read_rds("data/single_cont_moderator_tests.rds")
if (any(names(single_cont_moderator_tests) != cont_moderators)) {
warning("File on disk does not match current list of moderators. Consider deleting it to re-estimate.")
}
} else {
single_cont_moderator_tests <- with_progress({
cont_moderators %>%
set_names() %>%
map(\(moderator) single_cont_moderator_test(moderator, dataset))
})
write_rds(single_cont_moderator_tests, "data/single_cont_moderator_tests.rds")
}
# Collapse countries so that there are at least 3 per group (otherwise, group estimates too noisy
# and Wald test fails to converge with more than 30 levels)
dataset <- dataset %>% mutate(
country_grp = fct_lump_min(country, 3),
# Combined label used as a single exploratory moderator
domain_and_sub = paste(domain, sub_dom)
)
# Cache-or-compute for the exploratory categorical moderators; here levels
# with fewer than 3 effect sizes are excluded (min_group_k = 3)
if (file.exists("data/single_expl_cat_moderator_tests.qs")) {
single_expl_cat_moderator_tests <- qs::qread("data/single_expl_cat_moderator_tests.qs")
if (any(names(single_expl_cat_moderator_tests) != expl_mods)) {
warning("File on disk does not match current list of moderators. Consider deleting it to re-estimate.")
}
} else {
single_expl_cat_moderator_tests <- with_progress({
expl_mods %>%
set_names() %>%
map(\(moderator) single_cat_moderator_test(moderator, dataset, min_group_k = 3))
})
qs::qsave(single_expl_cat_moderator_tests, "data/single_expl_cat_moderator_tests.qs")
}
# Collect the Wald moderation tests across moderators into one tibble.
# The per-domain labels were extracted upstream with a letters-only regex,
# so "Job-related" arrives truncated as "Job" and is recoded here.
cat_summary <- single_cat_moderator_tests %>%
map_dfr("moderator_test", .id = "moderator") %>%
mutate(
domain = str_remove(domain, "domain"),
domain = case_when(
domain == "Job" ~ "Job-related",
TRUE ~ domain
)
)
# Continuous moderators: overall slope (dropping the intercept row) plus
# per-domain slopes (rows whose coefficient name contains ":")
cont_summary <- bind_rows(
single_cont_moderator_tests %>% map_dfr("robust_across", .id = "moderator") %>% as_tibble() %>% mutate(domain = "Overall") %>% filter(Coef != "intrcpt"),
single_cont_moderator_tests %>% map_dfr("robust_domain", .id = "moderator") %>% as_tibble() %>% filter(str_detect(Coef, ":")) %>%
mutate(domain = str_extract(Coef, ".*:") %>% str_remove("domain") %>% str_remove(":"))
)
# Same collection for the exploratory categorical moderators
cat_exp_summary <- single_expl_cat_moderator_tests %>%
map_dfr("moderator_test", .id = "moderator") %>%
mutate(
domain = str_remove(domain, "domain"),
domain = case_when(
domain == "Job" ~ "Job-related",
TRUE ~ domain
)
)
# Number of effect sizes (k) per moderator, taken from the domain model.
# NOTE(review): for the categorical results, meta_models has no element
# named exactly "mod_domains" - $mod_domains resolves via partial matching
# to "mod_domains_no_int". This works but is fragile; confirm intended.
ks <- single_cat_moderator_tests %>%
map_dbl(~ .x$meta_models$mod_domains$k) %>%
tibble(moderator = names(single_cat_moderator_tests), k = .) %>%
bind_rows(single_cont_moderator_tests %>% map_dbl(~ .x$meta_models$mod_domains$k) %>% tibble(moderator = names(single_cont_moderator_tests), k = .)) %>%
bind_rows(single_expl_cat_moderator_tests %>% map_dbl(~ .x$meta_models$mod_domains$k) %>% tibble(moderator = names(single_expl_cat_moderator_tests), k = .)) %>%
rowwise() %>%
mutate(moderator = var_names$new[var_names$old == moderator]) %>%
ungroup()
The code to bootstrap R2s for the moderators (for equivalence
testing) is very slow - it takes about 7-8 hours per moderator on an M2
Macbook Air. Therefore, it was run on AWS EC2 instances. The code used
is found in helpers/aws_code R2.R, but it is based on the code
provided for reference below.
# Run as background job
# Bootstrap pseudo-R2s for the pre-registered moderators. Results are cached
# per moderator and as a combined file; if the combined file exists it is
# loaded, otherwise an RStudio background job (job::job) is launched.
reg_mods <- c(cat_moderators, cont_moderators)
if (file.exists("data/bs_R2s_reg_mods.rds")) {
bs_R2_reg_mods <- read_rds("data/bs_R2s_reg_mods.rds")
} else {
job::job("Bootstrapping R2 reg mods" = {
library(boot)
R <- 5000 # Needed for bca confidence intervals (see https://bugs.r-project.org/show_bug.cgi?id=18647), though that takes 7-8h per moderator (M2 Macbook)
system.time({
bs_R2s <- reg_mods %>% set_names() %>% map(\(moderator) {
# Same seed per moderator so each bootstrap is reproducible in isolation
set.seed(1234)
# Per-moderator cache: skip moderators already bootstrapped
if (!file.exists(glue::glue("data/bs_R2s_{moderator}.rds"))) {
message("Now bootstrapping ", moderator)
# NOTE(review): unlike the exploratory job below, progress = FALSE is
# not passed here, so boot_R2 will call p() from its default - confirm
# a progressor `p` exists in the job environment
bs <- boot(dataset, boot_R2, R=R, moderator = moderator, ncpus = 6, parallel = "multicore")
write_rds(bs, glue::glue("data/bs_R2s_{moderator}.rds"))
bs
} else {
read_rds(glue::glue("data/bs_R2s_{moderator}.rds"))
}
})
})
write_rds(bs_R2s, "data/bs_R2s_reg_mods.rds")
}, import = c(rho, dataset, boot_R2, reg_mods))
message("Registered R2s will be bootstrapped now in background job - rerun this chunk afterwards to load them. Note that this is very slow.")
}
# Bootstrap pseudo-R2s for the exploratory moderators (same cache-or-launch
# pattern as the registered moderators above, but with progress disabled and
# one fewer worker).
if (file.exists("data/bs_R2s_expl_mods.rds")) {
  bs_R2_expl_mods <- read_rds("data/bs_R2s_expl_mods.rds")
} else {
  # Fixed copy-paste defect: this job was titled "Bootstrapping R2 reg mods"
  # like the registered-moderators job above, making the two background jobs
  # indistinguishable in the jobs pane.
  job::job("Bootstrapping R2 expl mods" = {
    library(boot)
    R <- 5000 # Needed for bca confidence intervals (see https://bugs.r-project.org/show_bug.cgi?id=18647), though that takes 7-8h per moderator (M2 Macbook)
    system.time({
      bs_R2s <- expl_mods %>% set_names() %>% map(\(moderator) {
        # Same seed per moderator so each bootstrap is reproducible in isolation
        set.seed(1234)
        # Per-moderator cache: skip moderators already bootstrapped
        if (!file.exists(glue::glue("data/bs_R2s_{moderator}.rds"))) {
          message("Now bootstrapping ", moderator)
          bs <- boot(dataset, boot_R2, R=R, moderator = moderator, progress = FALSE, ncpus = 5, parallel = "multicore")
          write_rds(bs, glue::glue("data/bs_R2s_{moderator}.rds"))
          bs
        } else {
          read_rds(glue::glue("data/bs_R2s_{moderator}.rds"))
        }
      })
    })
    write_rds(bs_R2s, "data/bs_R2s_expl_mods.rds")
  }, import = c(rho, dataset, boot_R2, expl_mods))
  message("Exploratory R2s will be bootstrapped now in background job - rerun this chunk afterwards to load them. Note that this is very slow.")
}
library(boot)
# Equivalence testing for the registered moderators' bootstrapped R2s:
# compute BCa CIs (falling back to percentile CIs when BCa fails) and a
# p-value for R2 being equivalent to < 5%. Cached in a qs file.
if (!file.exists("data/mod_R2_equivalence.qs")) {
job::job("Equivalence testing for moderator R2s" = {
mod_R2s <- imap_dfr(bs_R2_reg_mods, .id = "moderator", \(R2s, moderator) {
# Repeated calculation of bca confidence intervals takes ca. 10 mins per moderator
cli::cli_alert_info("Now processing {moderator}")
bs_type <- "bca"
# t0 is the statistic on the original (un-resampled) data
R2 <- R2s$t0
ci <- try(extract_boot_ci(boot.ci(R2s, .95, type = "bca")), silent = TRUE)
if (inherits(ci, "try-error")) {
# BCa can fail (e.g. all resamples identical); fall back to percentile
bs_type <- "perc"
cli::cli_alert_warning("Could not estimate bca confidence interval for {moderator} - falling back to percentile.")
ci <- boot.ci(R2s, .95, type = "perc") %>% extract_boot_ci()
}
# Equivalence test against the 5% threshold, matching the CI type used
p_equiv <- boot_get_equiv_p(R2s, 5, ci_type = bs_type)
tibble(R2 = R2, R2_ci = glue::glue("[{round_(ci$CI_L, 2)}%, {round_(ci$CI_U, 2)}%]"), p_equiv = p_equiv, ci_list = list(ci))
})
qs::qsave(mod_R2s, "data/mod_R2_equivalence.qs")
})
message("Equivalence testing for registered moderator R2s will be run now in background job - rerun this chunk afterwards to load them. This should take about 90 minutes.")
} else {
mod_R2s <- qs::qread("data/mod_R2_equivalence.qs")
}
# NOTE(review): this assignment appears to be a leftover from debugging -
# `R2s` is not read before being shadowed by the lambda parameter below;
# consider removing (kept here to preserve behavior).
R2s <- read_rds("data/bs_R2s_objective_rater.rds")
# Same equivalence-testing pattern for the exploratory moderators
if (!file.exists("data/mod_R2_equivalence_expl.qs")) {
job::job("Equivalence testing for exploratory moderator R2s" = {
mod_R2s <- imap_dfr(bs_R2_expl_mods, .id = "moderator", \(R2s, moderator) {
# Repeated calculation of bca confidence intervals takes ca. 10 mins per moderator
cli::cli_alert_info("Now processing {moderator}")
bs_type <- "bca"
R2 <- R2s$t0
ci <- try(extract_boot_ci(boot.ci(R2s, .95, type = "bca")), silent = TRUE)
if (inherits(ci, "try-error")) {
bs_type <- "perc"
cli::cli_alert_warning("Could not estimate bca confidence interval for {moderator} - falling back to percentile.")
ci <- boot.ci(R2s, .95, type = "perc") %>% extract_boot_ci()
}
p_equiv <- boot_get_equiv_p(R2s, 5, ci_type = bs_type)
tibble(R2 = R2, R2_ci = glue::glue("[{round_(ci$CI_L, 2)}%, {round_(ci$CI_U, 2)}%]"), p_equiv = p_equiv, ci_list = list(ci))
})
qs::qsave(mod_R2s, "data/mod_R2_equivalence_expl.qs")
})
message("Equivalence testing for exploratory moderator R2s will be run now in background job - rerun this chunk afterwards to load them. This should take about 90 minutes.")
} else {
mod_R2s_expl <- qs::qread("data/mod_R2_equivalence_expl.qs")
}
# NOTE(review): this line requires both cached files to exist (mod_R2s and
# mod_R2s_expl are only both defined after the background jobs have run and
# the chunk has been re-executed)
mod_R2s <- bind_rows(mod_R2s, mod_R2s_expl)
# Summary table of all univariate moderator tests: significance stars per
# domain, bootstrapped R2 with CI, and the equivalence-test p-value;
# rendered with gt and exported to .docx.
mod_table <- bind_rows(cat_summary %>% select(moderator, p_val, domain),
cont_summary %>% select(moderator, p_val = p_Satt, domain),
cat_exp_summary %>% select(moderator, p_val, domain)) %>%
left_join(mod_R2s, by = "moderator") %>%
# Sub-domain is reported separately, not in this table
filter(moderator != "domain_and_sub") %>%
rowwise() %>%
# NOTE(review): this replacement is a no-op as written (the same character
# on both sides); given the comment, presumably one side should be the
# HTML entity "&dagger;" - likely mis-encoded, confirm against the repo
mutate(p_fmt = paste(sigstars(p_val)) %>% str_replace_all("†", "†"), #.docx fails with this HTML code in body
moderator = var_names$new[var_names$old == moderator]) %>%
select(-p_val) %>%
# One column of stars per domain (Overall/Demographic/Cognitive/Job-related)
pivot_wider(names_from = "domain", values_from = "p_fmt") %>%
left_join(ks, by = "moderator") %>%
transmute(Moderator = moderator, k, Overall, Demographic, Cognitive, `Job-related`,
`R<sup>2</sup>` = glue::glue("{round_(R2, 2)}% {R2_ci}"), `R<sup>2</sup> < 5%` = fmt_p(p_equiv, equal_sign = FALSE)) %>%
gt::gt() %>%
tab_header(title = "Univariate moderator tests") %>%
tab_spanner(md("Significance tests"), Overall:`Job-related`) %>%
tab_spanner(md("Overall effect size"), id = "overall-spanner", `R<sup>2</sup>`:`R<sup>2</sup> < 5%`) %>%
cols_label(.fn = md) %>%
gt::tab_source_note(timesaveR:::std_stars %>% {glue::glue_collapse(paste(names(.),.), ", ")} %>% html()) %>%
gt_apa_style(fmt_labels_md = TRUE)
gtsave(mod_table, "tables/moderator_tests.docx")
mod_table
| Univariate moderator tests | |||||||
| Moderator | k | Significance tests | Overall effect size | ||||
|---|---|---|---|---|---|---|---|
| Overall | Demographic | Cognitive | Job-related | R2 | R2 < 5% | ||
Complexity |
2393 | * | * | 0.00% [0.00%, 0.36%] | < .001 | ||
Interdependence |
2229 | 0.89% [0.66%, 2.28%] | < .001 | ||||
Longevity |
2298 | * | 0.15% [0.00%, 0.75%] | < .001 | |||
Diversity measure |
2609 | 0.04% [0.00%, 0.76%] | < .001 | ||||
Performance rater |
2623 | * | 0.00% [0.00%, 0.09%] | < .001 | |||
Design |
2638 | 0.00% [0.00%, 1.06%] | < .001 | ||||
Article focus |
2638 | ** | * | †| 0.77% [0.49%, 1.84%] | < .001 | |
Performance criterion |
268 | 0.00% [0.00%, 4.36%] | .031 | ||||
Year of data collection |
2638 | †| * | 0.08% [0.00%, 0.63%] | < .001 | ||
Collectivism |
2328 | ** | 0.20% [0.04%, 0.99%] | < .001 | |||
Power distance |
2328 | * | * | 0.59% [0.23%, 2.16%] | < .001 | ||
Country |
2596 | * | 3.91% [1.51%, 6.89%] | .392 | |||
TMT |
2638 | †| * | 0.15% [0.01%, 0.96%] | < .001 | ||
Student sample |
2638 | 0.09% [0.00%, 0.39%] | < .001 | ||||
Industry sector |
2397 | 2.19% [1.26%, 5.90%] | .257 | ||||
Team function |
2492 | * | 2.96% [2.56%, 5.13%] | .755 | |||
Objective rater |
2623 | * | ** | 0.00% [0.00%, 0.07%] | < .001 | ||
| †0.1, * 0.05, ** 0.01, *** 0.001 | |||||||
# Group moderators for plotting: significant ones for the main figure, ones
# with many levels for a tall figure, non-significant ones for the SM.
# NOTE(review): the numeric indices refer to positions in cat_moderators /
# expl_mods and will silently select the wrong moderators if those vectors
# are reordered - selecting by name would be more robust
sig_mods <- c(single_cat_moderator_tests[c(1, 3, 7, 5)], single_expl_cat_moderator_tests[7], single_expl_cat_moderator_tests[2])
many_cats <- single_expl_cat_moderator_tests[c(1, 5)]
ns_mods <- c(single_cat_moderator_tests[c(2, 4, 6, 8)], single_expl_cat_moderator_tests[3])
main_plots <- cat_plots(sig_mods)
# Hide levels based on fewer than 5 articles in the tall/SM figures
tall_plots <- cat_plots(many_cats, minimum_articles = 5)
ns_plots <- cat_plots(ns_mods, minimum_articles = 5)
p <- wrap_plots(map(main_plots, "plot"), tag_level = "new") +
plot_annotation(tag_levels = 'A', title = "Estimated correlations depending on moderator levels",
subtitle = "Estimated from univariate meta-regressions with cluster-robust standard errors",
caption = "95% confidence intervals shown with error bars. Dotted lines indicate thresholds for small effects (|r| < .1).")
p2 <- wrap_plots(map(tall_plots, "plot"), tag_level = "new") +
plot_annotation(tag_levels = list(c('G', 'H')), title = "Estimated correlations depending on moderator levels",
subtitle = "Estimated from univariate meta-regressions with cluster-robust standard errors",
caption = "95% confidence intervals shown with error bars. Dotted lines indicate thresholds for small effects (|r| < .1).\nLevels (and domain-level combinations) based on fewer than 5 samples are not shown.")
# NOTE(review): LETTERS[9:(9+length(ns_plots))] yields length(ns_plots)+1
# letters (off by one); the surplus tag appears unused but confirm
p3 <- wrap_plots(map(ns_plots, "plot"), tag_level = "new") +
plot_annotation(tag_levels = list(LETTERS[9:(9+length(ns_plots))]), title = "Estimated correlations for non-significant moderators (in SM)",
subtitle = "Estimated from univariate meta-regressions with cluster-robust standard errors",
caption = "95% confidence intervals shown with error bars. Dotted lines indicate thresholds for small effects (|r| < .1).\nLevels (and domain-level combinations) based on fewer than 5 samples are not shown.")
# Tidy coefficient table across all plotted moderators, with display names
coefs <- bind_rows(main_plots %>% map_dfr("coefs", .id = "moderator"),
tall_plots %>% map_dfr("coefs", .id = "moderator"),
ns_plots %>% map_dfr("coefs", .id = "moderator")) %>% select(moderator, domain, everything()) %>%
rowwise() %>% mutate(moderator = var_names$new[var_names$old == moderator]) %>% ungroup()
ggsave("figures/moderated_estimates_cat.png", p, width = 9, height = 13)
ggsave("figures/moderated_estimates_tall.png", p2, width = 9, height = 11)
p
p2
p3
# Wide gt table of the coefficient estimates per moderator level and domain.
# The "z"/"zz" prefixes on df and p exist only to force the desired column
# sort order after pivot_wider; they are stripped from the labels below.
coefs %>%
rename(`*r*` = beta, SE = se, zdf = df, `zz*p*` = p) %>%
pivot_wider(names_from = "domain", values_from = c(`*r*`, SE, zdf, `zz*p*`), names_glue = "{domain}_{.value}") %>%
group_by(moderator) %>%
select(moderator, coef, sort(names(.))) %>% gt() %>%
# Split "domain_statistic" column names into spanner + column label
tab_spanner_delim("_") %>%
cols_label(coef = "") %>%
# Strip the sort-order "z" prefixes from the displayed labels
cols_label(.list = stats::setNames(lapply(unlist(.$`_boxhead`$column_label),
\(x) str_remove_all(x, "z") %>% md()), names(.$`_data`))) %>%
fmt(columns = matches(fixed("*r*")), fns = \(x) fmt_cor(x, digits = 3)) %>%
fmt(columns = matches(fixed("*p*")), fns = \(x) fmt_p(x, equal_sign = FALSE)) %>%
fmt_number(columns = matches("df"), decimals = 0) %>%
fmt_number(columns = matches("SE"), decimals = 2) %>%
gt_apa_style() %>%
cols_align("right", coef) %>%
tab_header(title = "Estimated correlations for categorical moderator levels") %>%
tab_source_note("NB: *df* are adjusted to robustly account for clustering, they do not indicate the number of observations. NA are shown where there were insufficient observations to estimate that value.")
| Estimated correlations for categorical moderator levels | ||||||||||||||||
| **Overall** | Cognitive | Demographic | Job-related | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| r | SE | df | p | r | SE | df | p | r | SE | df | p | r | SE | df | p | |
| Complexity | ||||||||||||||||
| High | .027 | 0.00 | 261 | < .001 | .038 | 0.01 | 132 | < .001 | .017 | 0.01 | 122 | .021 | .034 | 0.01 | 202 | < .001 |
| Low | .027 | 0.03 | 6 | .451 | -.093 | 0.04 | 6 | .041 | .066 | 0.03 | 3 | .095 | -.049 | 0.02 | 2 | .145 |
| Medium | .025 | 0.02 | 42 | .138 | .001 | 0.03 | 25 | .958 | .002 | 0.01 | 32 | .878 | .103 | 0.04 | 17 | .031 |
| Longevity | ||||||||||||||||
| Days | .016 | 0.04 | 3 | .672 | .079 | 0.13 | 4 | .576 | .021 | 0.03 | 2 | .531 | -.057 | 0.03 | 2 | .205 |
| Hours | .017 | 0.03 | 18 | .568 | .042 | 0.06 | 8 | .527 | -.007 | 0.02 | 14 | .718 | .103 | 0.01 | 1 | .057 |
| Months | .035 | 0.02 | 45 | .056 | .018 | 0.03 | 19 | .602 | .018 | 0.02 | 27 | .436 | .074 | 0.04 | 17 | .058 |
| Stable | .019 | 0.01 | 161 | < .001 | .017 | 0.01 | 102 | .148 | .010 | 0.01 | 108 | .223 | .031 | 0.01 | 149 | .001 |
| Weeks | -.047 | 0.03 | 15 | .083 | -.114 | 0.13 | 6 | .417 | -.022 | 0.02 | 16 | .377 | -.077 | 0.04 | 7 | .113 |
| Years | .039 | 0.02 | 62 | .020 | .070 | 0.03 | 24 | .011 | .016 | 0.03 | 43 | .582 | .050 | 0.03 | 43 | .092 |
| Article focus | ||||||||||||||||
| Auxiliary hypothesis | .032 | 0.01 | 84 | < .001 | .021 | 0.01 | 61 | .162 | .022 | 0.01 | 45 | .106 | .051 | 0.02 | 78 | .003 |
| Descriptive | .001 | 0.01 | 108 | .893 | -.001 | 0.02 | 48 | .956 | -.007 | 0.01 | 89 | .365 | .017 | 0.01 | 64 | .094 |
| Focal hypothesis | .031 | 0.01 | 167 | < .001 | .029 | 0.02 | 86 | .068 | .021 | 0.01 | 97 | .044 | .047 | 0.01 | 115 | < .001 |
| Performance rater | ||||||||||||||||
| Objective | .024 | 0.01 | 183 | < .001 | .022 | 0.01 | 106 | .048 | .024 | 0.01 | 104 | < .001 | .024 | 0.01 | 148 | .020 |
| Subjective - External | .020 | 0.02 | 51 | .261 | .009 | 0.04 | 24 | .836 | -.002 | 0.02 | 37 | .906 | .074 | 0.03 | 17 | .028 |
| Subjective - Self | .020 | 0.01 | 80 | .162 | .030 | 0.03 | 33 | .301 | -.013 | 0.02 | 67 | .549 | .057 | 0.03 | 53 | .028 |
| Subjective - Supervisor | .031 | 0.01 | 85 | .018 | .003 | 0.02 | 37 | .860 | -.000 | 0.02 | 63 | > .99 | .094 | 0.02 | 58 | < .001 |
| Objective rater | ||||||||||||||||
| Objective | .024 | 0.01 | 180 | < .001 | .022 | 0.01 | 105 | .047 | .024 | 0.01 | 102 | < .001 | .024 | 0.01 | 148 | .020 |
| Subjective | .024 | 0.01 | 195 | .005 | .016 | 0.02 | 80 | .345 | -.005 | 0.01 | 156 | .644 | .075 | 0.01 | 117 | < .001 |
| TMT | ||||||||||||||||
| No | .028 | 0.01 | 143 | < .001 | .002 | 0.01 | 92 | .870 | .019 | 0.01 | 91 | .040 | .065 | 0.02 | 114 | < .001 |
| Yes | .020 | 0.01 | 176 | < .001 | .034 | 0.01 | 98 | .007 | .007 | 0.01 | 126 | .408 | .025 | 0.01 | 136 | .007 |
| Country | ||||||||||||||||
| Australia | .025 | 0.03 | 9 | .491 | NA | NA | NA | NA | .002 | 0.04 | 8 | .947 | -.024 | 0.03 | 4 | .496 |
| China | .014 | 0.01 | 88 | .077 | .017 | 0.02 | 60 | .321 | -.006 | 0.01 | 71 | .664 | .039 | 0.02 | 75 | .018 |
| Germany | .041 | 0.03 | 14 | .132 | .076 | 0.05 | 4 | .234 | .033 | 0.03 | 15 | .210 | .049 | 0.12 | 4 | .696 |
| Israel | .164 | 0.07 | 9 | .045 | .163 | 0.04 | 4 | .015 | NA | NA | NA | NA | .188 | 0.09 | 8 | .082 |
| Italy | .020 | 0.05 | 8 | .671 | NA | NA | NA | NA | -.073 | 0.06 | 4 | .319 | .064 | 0.05 | 5 | .269 |
| Japan | -.012 | 0.03 | 4 | .660 | NA | NA | NA | NA | -.016 | 0.05 | 4 | .758 | NA | NA | NA | NA |
| Multiple | .041 | 0.01 | 28 | < .001 | .065 | 0.03 | 14 | .042 | .039 | 0.01 | 17 | .005 | .032 | 0.02 | 25 | .087 |
| Netherlands | -.002 | 0.03 | 17 | .935 | -.054 | 0.05 | 6 | .351 | -.004 | 0.05 | 11 | .939 | .038 | 0.07 | 8 | .592 |
| South Korea | .002 | 0.02 | 21 | .889 | -.013 | 0.04 | 11 | .758 | -.013 | 0.02 | 19 | .532 | .062 | 0.04 | 12 | .119 |
| Spain | .025 | 0.05 | 10 | .601 | .012 | 0.08 | 5 | .880 | .013 | 0.05 | 10 | .786 | .049 | 0.05 | 8 | .353 |
| Taiwan | .028 | 0.02 | 30 | .164 | -.019 | 0.03 | 19 | .489 | .074 | 0.05 | 9 | .201 | .060 | 0.03 | 16 | .059 |
| Turkey | .432 | 0.11 | 4 | .025 | NA | NA | NA | NA | NA | NA | NA | NA | .389 | 0.11 | 3 | .034 |
| United Kingdom | .096 | 0.05 | 7 | .105 | NA | NA | NA | NA | .024 | 0.04 | 4 | .592 | .248 | 0.06 | 5 | .007 |
| United States | .022 | 0.01 | 69 | .010 | .042 | 0.02 | 47 | .013 | .015 | 0.01 | 41 | .215 | .018 | 0.01 | 65 | .099 |
| Other | .051 | 0.06 | 7 | .396 | NA | NA | NA | NA | .029 | 0.07 | 7 | .695 | NA | NA | NA | NA |
| India | -.094 | 0.09 | 3 | .358 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| Team function | ||||||||||||||||
| Healthcare | .087 | 0.04 | 13 | .064 | NA | NA | NA | NA | .002 | 0.08 | 4 | .984 | .108 | 0.05 | 14 | .039 |
| Management | .019 | 0.01 | 182 | < .001 | .034 | 0.01 | 105 | .005 | .006 | 0.01 | 132 | .448 | .024 | 0.01 | 140 | .008 |
| Military personnel | -.010 | 0.03 | 5 | .760 | NA | NA | NA | NA | -.002 | 0.06 | 3 | .975 | NA | NA | NA | NA |
| Mixed | -.005 | 0.01 | 43 | .721 | -.052 | 0.02 | 28 | .041 | -.013 | 0.02 | 41 | .519 | .063 | 0.02 | 26 | .016 |
| Other | .085 | 0.02 | 7 | .003 | .103 | 0.05 | 4 | .098 | .078 | 0.02 | 4 | .011 | .097 | 0.04 | 12 | .045 |
| Production/manual work | .011 | 0.04 | 3 | .788 | .013 | 0.03 | 3 | .717 | -.001 | 0.03 | 4 | .957 | NA | NA | NA | NA |
| Prof services / consultancy | .045 | 0.04 | 4 | .271 | NA | NA | NA | NA | .022 | 0.06 | 3 | .725 | NA | NA | NA | NA |
| R & D / Research | .057 | 0.01 | 23 | < .001 | .042 | 0.02 | 19 | .083 | .040 | 0.01 | 9 | .007 | .101 | 0.02 | 34 | < .001 |
| Sales | .037 | 0.04 | 7 | .326 | -.053 | 0.07 | 3 | .484 | .008 | 0.04 | 5 | .835 | .124 | 0.10 | 4 | .304 |
| Software development | -.045 | 0.04 | 14 | .322 | -.195 | 0.21 | 7 | .388 | -.045 | 0.05 | 11 | .423 | .000 | 0.04 | 7 | > .99 |
| Sports players | .026 | 0.03 | 17 | .417 | NA | NA | NA | NA | .051 | 0.03 | 17 | .140 | NA | NA | NA | NA |
| Interdependence | ||||||||||||||||
| High | .023 | 0.00 | 280 | < .001 | .028 | 0.01 | 145 | .013 | .012 | 0.01 | 209 | .101 | .034 | 0.01 | 191 | < .001 |
| Low | .017 | 0.04 | 7 | .658 | NA | NA | NA | NA | -.012 | 0.03 | 7 | .703 | .070 | 0.08 | 5 | .432 |
| Medium | .056 | 0.03 | 10 | .072 | .046 | 0.08 | 9 | .594 | .050 | 0.03 | 6 | .107 | .076 | 0.04 | 18 | .063 |
| Diversity measure | ||||||||||||||||
| Other | .028 | 0.01 | 28 | .021 | .093 | 0.05 | 8 | .116 | .019 | 0.01 | 22 | .119 | .047 | 0.04 | 12 | .261 |
| Separation | .014 | 0.01 | 190 | .049 | -.005 | 0.02 | 79 | .811 | .011 | 0.01 | 97 | .195 | .028 | 0.01 | 123 | .034 |
| Variety | .029 | 0.01 | 288 | < .001 | .028 | 0.01 | 123 | .013 | .013 | 0.01 | 162 | .188 | .053 | 0.01 | 186 | < .001 |
| Design | ||||||||||||||||
| Experimental | -.025 | 0.04 | 21 | .579 | -.038 | 0.08 | 14 | .648 | -.017 | 0.03 | 11 | .511 | NA | NA | NA | NA |
| Observational | .025 | 0.00 | 310 | < .001 | .023 | 0.01 | 180 | .015 | .014 | 0.01 | 206 | .028 | .042 | 0.01 | 251 | < .001 |
| Performance criterion | ||||||||||||||||
| Convergence | .066 | 0.03 | 22 | .046 | .063 | 0.07 | 6 | .429 | .008 | 0.05 | 11 | .859 | .127 | 0.05 | 12 | .037 |
| Divergence | .093 | 0.04 | 24 | .018 | .086 | 0.09 | 6 | .370 | .080 | 0.06 | 9 | .195 | .103 | 0.04 | 17 | .013 |
| Production | .020 | 0.04 | 35 | .569 | .048 | 0.10 | 11 | .644 | -.012 | 0.03 | 24 | .642 | .053 | 0.03 | 20 | .093 |
| Student sample | ||||||||||||||||
| No | .025 | 0.00 | 295 | < .001 | .023 | 0.01 | 171 | .018 | .014 | 0.01 | 192 | .034 | .042 | 0.01 | 248 | < .001 |
| Yes | -.005 | 0.02 | 40 | .823 | -.017 | 0.04 | 26 | .712 | .002 | 0.02 | 30 | .911 | -.000 | 0.06 | 6 | > .99 |
| NB: *df* are adjusted to robustly account for clustering, they do not indicate the number of observations. NA are shown where there were insufficient observations to estimate that value. | ||||||||||||||||
predict_robust <- function(mod, moderator, include_domains = FALSE) {
# Predictions from a fitted rma.mv model across the observed range of a
# continuous moderator, using CR2 cluster-robust (Satterthwaite) variance
# estimates. Clustering is on the global `dataset$articlestudy`.
#
# mod: fitted rma.mv meta-regression
# moderator: name (string) of a continuous moderator column in `dataset`
# include_domains: if TRUE, `mod` is assumed to contain three domain dummies
#   plus domain x moderator interactions, and predictions are made per domain
# Returns a tibble of predictions plus the moderator (and dummy) values.
# From Viechtbauer - https://stat.ethz.ch/pipermail/r-sig-meta-analysis/2021-October/003516.html
rob <- robust(mod, cluster = dataset$articlestudy)
tmp1 <- coef_test(mod, vcov = "CR2")
tmp2 <- conf_int(mod, vcov = "CR2")
tmp3 <- Wald_test(mod, constraints = constrain_zero(mod$btt), vcov = "CR2")
# force those results into 'sav': overwrite the naive results in the robust
# model object with the CR2 versions so that predict() uses them
rob$b <- rob$beta <- tmp1$beta
rob$se <- tmp1$SE
rob$zval <- tmp1$tstat
rob$ddf <- tmp1$df
rob$pval <- tmp1$p_Satt
rob$ci.lb <- tmp2$CI_L
rob$ci.ub <- tmp2$CI_U
rob$vb <- vcovCR(mod, type = "CR2")
rob$QM <- tmp3$Fstat
rob$QMdf <- c(tmp3$df_num, round(tmp3$df_denom, 2))
rob$QMp <- tmp3$p_val
if (include_domains) {
# newmods rows: a 3x3 identity block (domain dummies) cbind-ed with the
# dummies times the moderator value, one triple per value in the range.
# NOTE(review): seq(min, max) steps by 1, so this assumes the moderator
# takes integer values - confirm for non-integer moderators.
mod_vals <- map(seq(min(dataset[[moderator]], na.rm = TRUE), max(dataset[[moderator]], na.rm = TRUE)), ~ {
matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE) %>%
cbind(., . * .x)
}) %>% do.call(rbind, .)
out <- predict(rob, newmods = mod_vals) %>% as_tibble()
# Attach the design-matrix values so callers can recover domain/moderator
colnames(mod_vals) <- names(coef(mod))
out <- cbind(out, mod_vals %>% as_tibble())
} else {
out <- predict(rob, newmods = seq(min(dataset[[moderator]], na.rm = TRUE), max(dataset[[moderator]], na.rm = TRUE))) %>% as_tibble()
out[[moderator]] <- seq(min(dataset[[moderator]], na.rm = TRUE), max(dataset[[moderator]], na.rm = TRUE))
}
out
}
create_cont_chart <- function(moderator) {
  # Plot estimated correlations across the observed range of a continuous
  # moderator, overall and separately per diversity domain (facetted).
  #
  # @param moderator String naming a continuous moderator in `dataset`; fitted
  #   models must exist in `single_cont_moderator_tests[[moderator]]`.
  # @return A ggplot object.
  preds <- predict_robust(single_cont_moderator_tests[[moderator]]$meta_models$mod, moderator)
  preds_dom <- predict_robust(single_cont_moderator_tests[[moderator]]$meta_models$mod_domains, moderator, TRUE)
  # Combine predictions for domains and overall
  all_coefs <- preds_dom %>%
    mutate(domain = case_when(
      domainDemographic == 1 ~ "Demographic",
      domainCognitive == 1 ~ "Cognitive",
      `domainJob-related` == 1 ~ "Job-related"
    )) %>%
    # Recover the moderator value from whichever domain interaction is active
    mutate(!!moderator := case_when(
      domainDemographic == 1 ~ !!sym(paste0("domainDemographic:", moderator)),
      domainCognitive == 1 ~ !!sym(paste0("domainCognitive:", moderator)),
      `domainJob-related` == 1 ~ !!sym(paste0("domainJob-related:", moderator))
    )) %>%
    # Drop the helper dummy/interaction columns while keeping the plain
    # "domain" label column (the regex requires at least one character after
    # "domain"). Fixed: starts_with() treats its argument as a literal
    # prefix, so the previous starts_with("domain.+") matched nothing;
    # matches() applies it as the intended regex.
    select(-matches("domain.+")) %>%
    bind_rows(preds %>% mutate(domain = "Overall"), .)
  # Observed moderator range per domain (and overall), used to trim the lines
  mod_range <- dataset %>%
    group_by(domain) %>%
    summarise(
      min_mod = min(!!sym(moderator), na.rm = TRUE),
      max_mod = max(!!sym(moderator), na.rm = TRUE)
    ) %>%
    bind_rows(
      dataset %>% summarise(domain = "Overall", min_mod = min(!!sym(moderator), na.rm = TRUE), max_mod = max(!!sym(moderator), na.rm = TRUE))
    )
  filtered_coefs <- all_coefs %>%
    left_join(mod_range, by = "domain") %>%
    filter(!!sym(moderator) >= min_mod & !!sym(moderator) <= max_mod) %>%
    select(-min_mod, -max_mod) %>%
    mutate(domain = factor(domain, levels = c("Overall", "Cognitive", "Demographic", "Job-related")))
  # Generate the plot: regression line with 95% CI ribbon, reference lines at
  # r = 0 (dashed) and |r| = .1 (dotted)
  ggplot(filtered_coefs, aes(x = !!sym(moderator), y = pred)) +
    geom_line(aes(color = domain)) +
    geom_ribbon(aes(ymin = pred - 1.96 * se, ymax = pred + 1.96 * se, fill = domain), alpha = 0.2) +
    geom_hline(yintercept = c(-0.1, .1), linetype = "dotted", linewidth = .5) +
    geom_hline(yintercept = 0, linetype = "dashed", linewidth = .5) +
    labs(x = "", subtitle = var_names$new[var_names$old == moderator], y = "*r*") +
    theme_minimal() +
    theme(
      axis.title.y = element_markdown(),
      legend.position = "none"
    ) +
    scale_color_brewer(palette = "Set1") +
    scale_fill_brewer(palette = "Set1") +
    facet_wrap(~domain)
}
# Continuous-moderator charts (year, collectivism, power distance), combined
# with the team-function plot from earlier into one patchwork figure
ps <- list(create_cont_chart("year_merged"),
create_cont_chart("collectivism"),
create_cont_chart("power_distance"))
# Patchwork layout: A-C are the moderator charts, D the team-function plot,
# E is a spacer column separating the two
layout <- ("
AAAAAAAAEDDDD
BBBBBBBBEDDDD
CCCCCCCCEDDDD
")
p <- ps[[1]] + ps[[2]] + ps[[3]] +
(tall_plots[[2]]$plot + labs(caption = "Only functions investigated in at least 5 samples are shown.")) +
plot_spacer() + plot_layout(design = layout) +
plot_annotation(tag_levels = list(c('G', 'H', "I", "J")), title = "Estimated correlations depending on moderator value",
subtitle = "Estimated from univariate meta-regressions with cluster-robust standard errors",
caption = "NB: Lengths of the regression line corresponds to observed range of the data.")
ggsave("figures/moderated_estimates_cont.png", p, width = 9, height = 12)
p
To impute data for the meta-regression, we had to exclude some variables where imputation would not make sense:
# Multiple imputation of moderators: load cached imputations if available,
# otherwise run mice in a background job (slow with m = 100)
if (file.exists("data/imputed_data.rds")) {
imp <- read_rds("data/imputed_data.rds")
} else {
job::job("impute_data" = {
# Variables to impute and to use as predictors for each other
mi_vars <- c("art_focus", "pub_status", "design", "setting", "ind_sector", "team_function", "country", "n_teams", "stud_sample", "tmt", "domain", "sub_dom", "meas_type", "rater", "interdep", "complexity", "longevity", "power_distance", "collectivism", "year_merged", "citation_count", "criterion")
# Character columns must be factors for mice's categorical methods
convert_to_factor <- function(x) {
if (is.character(x)) {
return(as.factor(x))
} else {
return(x)
}
}
dataset <- dataset %>%
mutate(across(any_of(mi_vars), convert_to_factor))
# Specify ordered factors (so mice defaults to polr for them)
dataset$longevity <- factor(dataset$longevity, levels = c("hours", "days", "weeks", "months", "years", "stable"), ordered = TRUE)
dataset$interdep <- factor(dataset$interdep, levels = c("low", "medium", "high"), ordered = TRUE)
dataset$complexity <- factor(dataset$complexity, levels = c("low", "medium", "high"), ordered = TRUE)
# Get matrices: dry run (maxit = 0) only to obtain default predictor
# matrix and method vector as templates
imp <- mice(dataset, maxit = 0, seed = 1526)
# Set variables of interest to be predicted by each other, all others ignored
preds <- imp$predictorMatrix
preds[] <- 0
preds[mi_vars, mi_vars] <- 1
# Use other model variables as predictors (as per mice vignette guidance)
preds[mi_vars, "r_adj"] <- 1
preds[mi_vars, "se"] <- 1
# Standard methods per variable type are generally good (polyreg for factors, polr for ordered, pmm for continuous)
# but polyreg does not work well with factors with many levels - so these set to pmm
methods <- imp$method
methods[c("ind_sector", "team_function", "country", "sub_dom")] <- "pmm"
methods[!names(methods) %in% mi_vars] <- ""
m <- 100 # Based on Van Buren, to attain maximum power: https://stefvanbuuren.name/fimd/sec-howmany.html
# Snapshot of domains for the custom method below (x is dummy-coded there)
domains <- dataset$domain
# Ensure that sub-domains are only imputed within each domain (otherwise get invalid imputations)
# Are missing very rarely (usually only for education level vs degree)
mice.impute.imp_sub_domain <- function(y, ry, x, ...) {
# y is the vector to be imputed
# ry flags the observed entries of y (TRUE = observed, mice convention)
# x is the matrix of predictor variables, already filtered by predMatrix
# dots$wy flags the cells for which imputations are requested
dots <- list(...)
# Domain information cannot be extracted from x as it is dummy-coded there, so take from global env
unique_domains <- unique(domains)
imputed <- rep(NA, sum(dots$wy)) # Placeholder: one slot per cell flagged in wy
imps <- list()
for (d in unique_domains) {
# Subset data for the current domain
rows_in_domain <- which(domains == d) # All rows belonging to this domain
y_sub <- y[rows_in_domain]
x_sub <- x[rows_in_domain, ]
ry_sub <- ry[rows_in_domain]
dots_sub <- dots
dots_sub$wy <- dots_sub$wy[rows_in_domain]
# Apply polyreg imputation method for the current domain
args_list <- c(list(y_sub, ry_sub, x_sub), dots_sub)
imp <- do.call("mice.impute.polyreg", args_list)
# Assign the imputed values back to the result vector
imps[d] <- list(imp)
}
# Order imps content based on domains and locations specified in wy
ds <- domains[dots$wy]
for (d in unique_domains) {
imputed[ds == d] <- unlist(imps[d])
}
return(imputed)
}
methods["sub_dom"] <- "imp_sub_domain"
# Run the actual imputation with the customised matrices/methods
imp <- mice(dataset, m = m, seed = 1526, predictorMatrix = preds, method = methods)
# Save the imputed data
write_rds(imp, "data/imputed_data.rds")
})
}
# From https://www.jepusto.com/mi-with-clubsandwich/
pool_robust <- function(models) {
  # Pool CR2 cluster-robust coefficient tests across models fitted to
  # multiply-imputed datasets, using Rubin's rules with the Barnard-Rubin
  # small-sample degrees of freedom adjustment.
  #
  # @param models List of fitted models (one per imputed dataset).
  # @return Tibble with pooled estimate, se, t, df, p-value, 95% CI and the
  #   fraction of missing information (gamma) per coefficient.
  tests <- map(models, coef_test, vcov = "CR2")
  # Record which coefficient each row belongs to before stacking
  tests <- map(tests, \(x) {
    x$coef <- row.names(x)
    x
  })
  tests %>%
    bind_rows() %>%
    as.data.frame() %>%
    # summarize by coefficient
    group_by(coef) %>%
    summarise(
      m = n(),                # number of imputations
      B = var(beta),          # between-imputation variance
      beta_bar = mean(beta),  # pooled point estimate
      V_bar = mean(SE^2),     # average within-imputation variance
      eta_bar = mean(df_Satt) # average Satterthwaite df
    ) %>%
    mutate(
      # calculate intermediate quantities to get df
      V_total = V_bar + B * (m + 1) / m,
      gamma = ((m + 1) / m) * B / V_total,
      df_m = (m - 1) / gamma^2,
      df_obs = eta_bar * (eta_bar + 1) * (1 - gamma) / (eta_bar + 3),
      df = 1 / (1 / df_m + 1 / df_obs),
      # calculate summary quantities for output
      se = sqrt(V_total),
      t = beta_bar / se,
      p_val = 2 * pt(abs(t), df = df, lower.tail = FALSE),
      crit = qt(0.975, df = df),
      lo95 = beta_bar - se * crit,
      hi95 = beta_bar + se * crit
    ) %>%
    select(coef, est = beta_bar, se, t, df, p_val, lo95, hi95, gamma)
}
pooled_Wald <- function(models, which_coefs) {
  # Pool a joint (multi-df) Wald test across models fitted to multiply-imputed
  # datasets: combines estimates/variances via Rubin's rules (MIcombine) and
  # scales the Wald quadratic form into an F statistic with denominator df
  # averaged from the complete-data HTZ small-sample tests.
  # (Cleanup: removed a dead `eta <- numeric()` initialisation and a no-op
  # bare `res$coefficients` expression.)
  #
  # @param models List of fitted rma.mv models (one per imputed dataset).
  # @param which_coefs Regex identifying the coefficients to test jointly.
  # @return One-row tibble: query, Fstat, df_num, df_denom, p_val.
  # Get coefficients
  coefs <- models %>% map(coef)
  # Get CR2 variance-covariance matrices
  vcovs <- models %>% map(vcovCR, type = "CR2")
  # Indices of matching coefficients (all models share coefficient names,
  # so the first model is used as reference)
  ref_coefs <- coefs[[1]] %>%
    names() %>%
    str_detect(., which_coefs) %>%
    which()
  message("For ", which_coefs, " now testing: ", coefs[[1]] %>% names() %>% .[ref_coefs] %>% paste(collapse = " "))
  # Get denominator df from complete-data F-tests (HTZ correction)
  q <- length(ref_coefs)
  dfs <-
    models %>%
    map_dfr(Wald_test, vcov = "CR2", constraints = constrain_zero(ref_coefs), test = "HTZ") %>%
    mutate(
      eta = df_denom + (q - 1)
    )
  eta <- mean(dfs$eta)
  # Combine imputed results (Rubin's rules for estimates and variances)
  res <- MIcombine(results = coefs, variances = vcovs)
  Cmat <- constrain_zero(ref_coefs, coefs = res$coefficients)
  # Wald quadratic form on the pooled estimates, rescaled to an F statistic
  Q <- t(Cmat %*% res$coefficients) %*% solve(Cmat %*% res$variance %*% t(Cmat)) %*% (Cmat %*% res$coefficients)
  Fstat <- (eta - q + 1) / (eta * q) * as.numeric(Q)
  p_val <- pf(Fstat, df1 = q, df2 = eta - q + 1, lower.tail = FALSE)
  tibble(query = which_coefs, Fstat = Fstat, df_num = q, df_denom = eta - q + 1, p_val = p_val)
}
# Fit meta-regression models on every imputed dataset: load from cache if
# available, otherwise fit in a background job (slow: 2 rma.mv models x m
# datasets) and reload on a later run of this chunk
if (file.exists("data/imp_meta_reg_mods.qs")) {
imputed_mods <- qs::qread("data/imp_meta_reg_mods.qs")
} else {
if (!exists("imp")) stop("Imputation object (imp) not found - run/load imputation first.")
job::job("Impute meta-regression" = {
imp_datasets <- complete(imp, "all")
with_progress({
p <- progressor(steps = length(imp_datasets), label = "Imputing meta-regression models")
imputed_mods <- map(imp_datasets, \(data) {
p()
# try() so that single non-converging datasets do not abort the job;
# failures are discarded later
try({
with(data, {
# Block-diagonal sampling covariance matrix, assuming correlation
# rho between effect sizes from the same articlestudy
V <- impute_covariance_matrix(
vi = var_adj,
cluster = articlestudy,
r = rho
)
# Strip ordering and set substantive reference levels so that
# coefficients are interpretable contrasts
complexity <- factor(complexity, ordered = FALSE) %>% relevel(ref = "high")
interdep <- factor(interdep, ordered = FALSE) %>% relevel(ref = "high")
longevity <- factor(longevity, ordered = FALSE) %>% relevel(ref = "stable")
art_focus <- factor(art_focus) %>% relevel(ref = "focal H")
meas_type <- factor(meas_type) %>% relevel(ref = "Variety")
rater <- factor(rater) %>% relevel(ref = "Objective")
design <- factor(design) %>% relevel(ref = "Observational")
# Overall model: all moderators jointly, effects nested in
# articlestudy, t-tests with containment dfs
mod <- rma.mv(
r_adj ~ year_merged + complexity + interdep + collectivism +
longevity + meas_type + rater + design + art_focus,
V = V,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
sparse = TRUE
)
# Same model with domain interactions for every moderator
mod_interact <- rma.mv(
r_adj ~ domain * year_merged + domain * complexity + domain * interdep + domain * collectivism +
domain * longevity + domain * meas_type + domain * rater + domain * design + domain * art_focus,
V = V,
random = ~ 1 | articlestudy / effect_id,
test = "t",
dfs = "contain",
sparse = TRUE
)
list(mod = mod, mod_interact = mod_interact)
})
})
})
imputed_mods %>% qs::qsave("data/imp_meta_reg_mods.qs")
job::export(NULL)
})
})
message("Models will be imputed now in background job - rerun this chunk afterwards to load them.")
}
# Pool results
## Remove failed convergence (models wrapped in try() above)
m <- length(imputed_mods)
imputed_mods <- imputed_mods %>% discard(\(x) inherits(x, "try-error"))
message("Excluded ", m - length(imputed_mods), " models due to convergence issues.")
## Excluded 2 models due to convergence issues.
# Pool the CR2-robust coefficient tests across imputations (Rubin's rules)
coefs_across <- map(imputed_mods, "mod") %>% pool_robust()
coefs_domains <- map(imputed_mods, "mod_interact") %>% pool_robust()
# Moderator sets used for the Wald tests and table assembly below
cat_moderators <- c("complexity", "interdep", "longevity", "meas_type", "rater", "design", "art_focus")
cont_moderators <- c("year_merged", "collectivism", "power_distance")
# Run Wald tests
# Joint (multi-df) tests per categorical moderator within each domain, based
# on the interaction models; cached to file
if (file.exists("data/cat_domain_wald_test_multiple.rds")) {
cat_interaction_tests <- read_rds("data/cat_domain_wald_test_multiple.rds")
} else {
# Regexes selecting each moderator's coefficients per domain: "^moderator"
# picks the main effects, "Cognitive:"/"Job-related:" the interaction terms.
# NOTE(review): this assumes "Demographic" is the reference domain level in
# mod_interact - confirm against the model's coefficient names.
cat_moderators_regex <- c(
paste0("^", cat_moderators),
paste0("Cognitive:", cat_moderators),
paste0("Job-related:", cat_moderators)
)
# Loop over domains (^, Cognitive:, Job-related:), merge all results
cat_interaction_tests <- map(imputed_mods, "mod_interact") %>%
{
map_dfr(cat_moderators_regex, \(query) pooled_Wald(., query))
} %>%
mutate(
domain = case_when(
str_detect(query, "^\\^") ~ "Demographic",
str_detect(query, "Cognitive:") ~ "Cognitive",
str_detect(query, "Job-related:") ~ "Job-related"
),
moderator = query %>% str_remove("^\\^") %>%
str_remove("Cognitive:") %>%
str_remove("Job-related:"),
# Map each stripped query string back to its canonical moderator name
moderator = sapply(moderator, function(coeff) {
matches <- str_detect(coeff, paste0("^", cat_moderators))
if (any(matches)) {
return(cat_moderators[which.max(matches)]) # Returns the first matched moderator
} else {
warning("Failed moderator match: ", coeff)
return(NA)
}
})
)
write_rds(cat_interaction_tests, "data/cat_domain_wald_test_multiple.rds")
}
# Joint Wald tests per categorical moderator in the overall (no-interaction)
# model; cached to file
if (file.exists("data/cat_overall_wald_test_multiple.rds")) {
cat_avg_tests <- read_rds("data/cat_overall_wald_test_multiple.rds")
} else {
cat_avg_tests <- map(imputed_mods, "mod") %>%
{
map_dfr(cat_moderators, \(query) pooled_Wald(., query))
} %>%
# Map each query back to its canonical moderator name (first match wins)
mutate(domain = "Overall", moderator = sapply(query, function(coeff) {
matches <- str_detect(coeff, paste0("^", cat_moderators))
if (any(matches)) {
return(cat_moderators[which.max(matches)]) # Returns the first matched moderator
} else {
return(NA)
}
}))
write_rds(cat_avg_tests, "data/cat_overall_wald_test_multiple.rds")
}
# Combine all results into 1 table: continuous moderator t-tests (per domain
# and overall) plus the categorical Wald tests, shown as p-values
coefs_domains %>%
# Keep only rows for the continuous moderators (any regex match)
filter(map(cont_moderators, \(m) str_detect(coef, m)) %>% Reduce(`|`, .)) %>%
mutate(
# NOTE(review): .default = "Demographic" assumes Demographic is the
# reference domain (its terms carry no "domain:" prefix) - confirm
domain = case_when(str_detect(coef, "Cognitive:") ~ "Cognitive",
str_detect(coef, "Job-related:") ~ "Job-related",
.default = "Demographic"
),
moderator = coef %>% str_remove(".*:")
) %>%
bind_rows(coefs_across %>% filter(coef %in% cont_moderators) %>% transmute(domain = "Overall", moderator = coef, p_val)) %>%
bind_rows(cat_interaction_tests, cat_avg_tests) %>%
select(domain, moderator, p_val) %>%
rowwise() %>%
mutate(
# Format p-values with significance stars; `position` preserves the row
# order of the earlier moderator table (mod_table)
p_fmt = paste(fmt_p(p_val, equal_sign = FALSE), sigstars(p_val)),
moderator = var_names$new[var_names$old == moderator], position = which(moderator == mod_table$`_data`$Moderator)
) %>%
select(moderator, domain, p_fmt, position) %>%
pivot_wider(names_from = "domain", values_from = "p_fmt") %>%
arrange(position) %>%
select(Moderator = moderator, Overall, Demographic, Cognitive, `Job-related`) %>%
gt::gt() %>%
tab_header(title = "Multivariate (meta-regression) moderator tests") %>%
tab_spanner(md("Significance tests (*p*-values)"), Overall:`Job-related`) %>%
gt::fmt(fns = md) %>%
# Footnote explaining the significance stars (taken from timesaveR)
gt::tab_source_note(timesaveR:::std_stars %>%
{
glue::glue_collapse(paste(names(.), .), ", ")
} %>% html())
| Multivariate (meta-regression) moderator tests | ||||
| Moderator | Significance tests (p-values) | |||
|---|---|---|---|---|
| Overall | Demographic | Cognitive | Job-related | |
| Complexity | .228 | .569 | .065 †| .172 |
| Interdependence | .361 | .357 | .809 | .691 |
| Longevity | .583 | .921 | .442 | .809 |
| Diversity measure | .381 | .890 | .588 | .376 |
| Performance rater | .814 | .296 | .307 | .022 * |
| Design | .247 | .342 | .495 | .902 |
| Article focus | .008 ** | .191 | .836 | .981 |
| Year of data collection | .787 | .868 | .916 | .606 |
| Collectivism | .169 | .866 | .047 * | .415 |
| †0.1, * 0.05, ** 0.01, *** 0.001 | ||||
Due to its probabilistic nature, metaCART had to be run
repeatedly (here 50 times on each of 10 datasets, using two different
estimation methods). This is very slow (~24 hours on 16 cores), so it was
run on an AWS EC2 instance - see
helpers/aws_code metacart.R. The code below is just
provided for reference - here, results are loaded from the file as
downloaded from AWS.
# Aggregate dependent effect sizes for metaCART: effects that are identical on
# all moderators within an articlestudy share an es_aggregate_id and are then
# combined assuming a compound-symmetric correlation (r = rho) between them
dataset_aggregated <- dataset %>%
group_by(articlestudy, pub_status, art_focus, year_merged, design, rater, collectivism, interdep,
complexity, longevity, virtuality, domain, sub_dom, meas_type) %>%
mutate(es_aggregate_id = cur_group_id()) %>%
ungroup()
dataset_aggregated <- escalc(yi = r_adj, vi = var_adj, data = dataset_aggregated) %>%
aggregate.escalc(cluster = es_aggregate_id, rho = rho, struct = "CS", na.rm = FALSE)
# Remove escalc class to avoid future issues
dataset_aggregated <- as_tibble(dataset_aggregated)
# Specify ordered factors
dataset_aggregated$longevity <- factor(dataset_aggregated$longevity, levels = c("hours", "days", "weeks", "months", "years", "stable"), ordered = TRUE)
dataset_aggregated$interdep <- factor(dataset_aggregated$interdep, levels = c("low", "medium", "high"), ordered = TRUE)
dataset_aggregated$complexity <- factor(dataset_aggregated$complexity, levels = c("low", "medium", "high"), ordered = TRUE)
# Cache for the (externally run) metaCART estimation
qs::qsave(dataset_aggregated, "data/dataset_aggregated_metacart.qs")
dummy_code_df <- function(df, exclude_column, drop_first = TRUE) {
  # Replace every unordered factor column (except `exclude_column`) with
  # logical dummy columns named "<column>_<level>"; ordered factors and
  # non-factor columns are left untouched.
  #
  # @param df A data frame.
  # @param exclude_column Character scalar naming the column to leave as-is.
  # @param drop_first Drop the first level (treat it as reference category)?
  # @return The data frame with unordered factors replaced by dummies.
  if (!is.character(exclude_column)) {
    stop("exclude_column must be a character string.")
  }
  # Iterate over a snapshot of the original names so newly added dummy
  # columns are not revisited
  original_cols <- names(df)
  for (column in original_cols) {
    skip <- column == exclude_column ||
      !is.factor(df[[column]]) ||
      is.ordered(df[[column]])
    if (skip) next
    lvls <- levels(df[[column]])
    if (drop_first) {
      lvls <- lvls[-1] # first level serves as the reference category
    }
    for (lvl in lvls) {
      # Repair names so levels with special characters yield valid columns
      dummy_col <- vctrs::vec_as_names(paste0(column, "_", lvl), repair = "universal_quiet")
      df[[dummy_col]] <- df[[column]] == lvl
    }
    # Drop the original factor column
    df[[column]] <- NULL
  }
  df
}
# Imputation code derived from mice package (but noise removed to get single best)
# mice copyright Stef van Buuren - thanks
impute_best_polr <- function(dat, var_name) {
  # Single-best imputation of an ordered factor `var_name` via proportional-
  # odds regression (MASS::polr) on all other columns; each missing value is
  # replaced by the category with the highest predicted probability.
  #
  # @param dat Data frame including `var_name` (ordered factor with NAs).
  # @param var_name Character name of the column to impute.
  # @return The `var_name` column with missing values filled where possible.
  dat <- dummy_code_df(dat, var_name)
  # Drop collinear predictors (mice helper), but never the target itself
  collinear <- mice:::find.collinear(dat)
  if (length(collinear) > 0) {
    dat <- dat[-which(names(dat) %in% setdiff(collinear, var_name))]
  }
  y_missing <- which(is.na(dat[var_name]))
  # Model formula: var_name ~ all remaining columns
  f <- setdiff(colnames(dat), var_name) %>%
    glue::glue_collapse(sep = " + ") %>%
    paste(var_name, "~", .)
  complete_dat <- dat %>% drop_na(everything())
  ## polr may fail on sparse data. We revert to multinom in such cases.
  # Fixed: previously the polr fit was never assigned and predict()
  # referenced the still-undefined `fit`, so this branch always errored
  # and the function unconditionally fell back to multinom.
  fit <- try({
    fit <- MASS::polr(as.formula(f),
      data = complete_dat,
    )
    post <- predict(fit, dat[is.na(dat[var_name]), ], type = "probs")
    fit
  })
  if (inherits(fit, "try-error")) {
    message("polr falls back to multinom")
    fit <- nnet::multinom(as.formula(f),
      data = complete_dat,
      maxit = 100, trace = FALSE,
      MaxNWts = 1500
    )
    post <- predict(fit, dat[is.na(dat[var_name]), ], type = "probs")
  }
  # With a single missing row, predict() returns a vector - coerce to matrix
  if (length(y_missing) == 1) {
    temp <- matrix(post, nrow = 1, ncol = length(post))
    colnames(temp) <- names(post)
    post <- temp
  }
  # Pick the most probable category per missing row (NA if no probabilities)
  max_col_index <- apply(post, 1, \(x) {
    if (all(is.na(x))) {
      return(NA)
    } else {
      return(which(x == max(x, na.rm = TRUE)))
    }
  })
  new_y <- colnames(post)[max_col_index]
  if (length(dat[[var_name]][y_missing]) != length(new_y)) browser() # debugging hook for length mismatches
  dat[[var_name]][y_missing] <- new_y
  message("Imputed ", sum(!is.na(dat[[var_name]][y_missing])), " values.")
  dat[[var_name]]
}
impute_best_polyreg <- function(dat, var_name) {
  # Single-best imputation of an unordered factor `var_name` via multinomial
  # regression (nnet::multinom) on all other columns; each missing value is
  # replaced by the category with the highest predicted probability.
  #
  # @param dat Data frame including `var_name` (factor with NAs).
  # @param var_name Character name of the column to impute.
  # @return The `var_name` column with missing values filled where possible.
  dat <- dummy_code_df(dat, var_name)
  # Drop collinear predictors (mice helper), but never the target itself
  collinear <- mice:::find.collinear(dat)
  if (length(collinear) > 0) {
    dat <- dat[-which(names(dat) %in% setdiff(collinear, var_name))]
  }
  y_missing <- which(is.na(dat[var_name]))
  # Model formula: var_name ~ all remaining columns
  f <- setdiff(colnames(dat), var_name) %>%
    glue::glue_collapse(sep = " + ") %>%
    paste(var_name, "~", .)
  complete_dat <- dat %>% drop_na(everything())
  fit <- nnet::multinom(as.formula(f),
    data = complete_dat,
    maxit = 100, trace = FALSE,
    MaxNWts = 3000
  )
  tryCatch(
    {
      post <- predict(fit, dat[is.na(dat[var_name]), ], type = "probs")
      # With a single missing row, predict() returns a vector - coerce to matrix
      if (length(y_missing) == 1) {
        temp <- matrix(post, nrow = 1, ncol = length(post))
        colnames(temp) <- names(post)
        post <- temp
      }
      # Pick the most probable category per missing row (NA if no probabilities)
      max_col_index <- apply(post, 1, \(x) {
        if (all(is.na(x))) {
          return(NA)
        } else {
          return(which(x == max(x, na.rm = TRUE)))
        }
      })
      new_y <- colnames(post)[max_col_index]
      dat[[var_name]][y_missing] <- new_y
    },
    error = \(e) {
      # On failure, leave the column unchanged. (Cleanup: the handler
      # previously `return`ed dat[[var_name]], but tryCatch's value is
      # discarded here, so that return was dead and misleading.)
      warning("Prediction failed")
    }
  )
  message("Imputed ", sum(!is.na(dat[[var_name]][y_missing])), " values.")
  dat[[var_name]]
}
# Imputation strategy: identify missing patterns, then impute for each pattern so that always the largest available set of predictors is used
missing_patterns <- function(dataset, target_var) {
# Summarise the missingness patterns among rows where `target_var` itself is
# missing. Returns one row per pattern with:
#   missing_vars - names of the other missing variables joined by "|"
#                  (NA when no other variable is missing); "|" doubles as
#                  regex alternation when later passed to matches()
#   m            - number of missing predictor variables in the pattern
#   n            - number of rows showing this pattern
# sorted so that the most complete patterns come first.
dataset <- dataset[is.na(dataset[[target_var]]), ]
dataset %>%
select(-matches(target_var)) %>%
# For each row, record which of the remaining variables are missing
mutate(missing_vars = pmap_chr(., function(...) {
vars <- names(c(...))
missing <- vars[is.na(c(...))]
if (length(missing) == 0) {
return(NA_character_)
} # Return NA if no missing vars
paste(missing, collapse = "|")
})) %>%
# Convert to 0/1 missingness indicators and count missings per row
mutate(across(-missing_vars, ~ as.numeric(is.na(.)))) %>%
mutate(m = rowSums(across(-missing_vars), na.rm = TRUE)) %>%
group_by(missing_vars, m) %>%
summarise(n = n(), .groups = "drop") %>%
arrange(m, -n)
}
impute_sequentially <- function(dataset, target_var, fun) {
  # Impute `target_var` pattern by pattern, so that each call to the
  # imputation function `fun` uses the largest set of fully observed
  # predictors available for the rows in question.
  #
  # @param dataset Data frame containing `target_var` and predictors.
  # @param target_var Character name of the column to impute.
  # @param fun Imputation function taking (data, var_name), e.g.
  #   impute_best_polr or impute_best_polyreg.
  # @return The imputed `target_var` column.
  patterns <- missing_patterns(dataset, target_var)
  # Rows with no other missing values can use every predictor at once
  if (any(is.na(patterns$missing_vars))) {
    dataset[[target_var]] <- do.call(fun, list(dataset, target_var))
  }
  patterns <- patterns %>% filter(!is.na(missing_vars))
  if (nrow(patterns) == 0) {
    return(dataset[[target_var]])
  }
  # For each remaining pattern, drop its missing predictors and re-impute
  # ("|" in missing_vars acts as regex alternation inside matches())
  for (i in seq_len(nrow(patterns))) {
    available_predictors <- dataset %>% select(-matches(patterns$missing_vars[i]))
    dataset[[target_var]] <- do.call(fun, list(available_predictors, target_var))
  }
  dataset[[target_var]]
}
# For "worst case", impute by drawing random values from observed data
impute_with_random_draw <- function(dataframe) {
  # Fill every NA in each column with a value drawn at random (with
  # replacement) from that column's observed values. Columns that are
  # entirely NA are left unchanged (nothing to draw from).
  #
  # @param dataframe A data frame, possibly containing NAs.
  # @return The data frame with NAs replaced by random observed values.
  for (col in names(dataframe)) {
    missing_indices <- which(is.na(dataframe[[col]]))
    if (length(missing_indices) > 0) {
      non_missing_values <- dataframe[[col]][!is.na(dataframe[[col]])]
      if (length(non_missing_values) > 0) {
        # Fixed: index into the observed values via sample.int() instead of
        # sample(non_missing_values, ...) - with a single observed numeric
        # value v, sample(v, ...) would draw from 1:v rather than from v.
        draws <- sample.int(length(non_missing_values), length(missing_indices), replace = TRUE)
        dataframe[[col]][missing_indices] <- non_missing_values[draws]
      }
    }
  }
  return(dataframe)
}
n_datasets <- 10
n_trees <- 50
if (!(file.exists("data/metacart_trees_ahead.qs")) || !(file.exists("data/metacart_trees_standard.qs"))) { # Do not run if estimation has already been done
if (!(file.exists("data/cart_imputed_best.qs")) || !(file.exists("data/cart_imputed_random.qs"))) { # Do not run if imputation has already been done
set.seed(2124)
dataset_aggregated <- qs::qread("data/dataset_aggregated_metacart.qs")
datasets <- list()
datasets_imp_ahead <- list()
datasets_imp_standard <- list()
# specify variables to include in imputation (those in metaCart and relevant other preds)
mi_vars <- c(
"art_focus", "pub_status", "design", "setting", "ind_sector", "team_function",
"n_teams", "stud_sample",
"tmt", "domain", "sub_dom", "meas_type", "rater", "interdep", "complexity", "longevity", "power_distance",
"collectivism", "year_merged", "citation_count"
)
# Resample datasets
for (i in seq_len(n_datasets)) {
datasets[[i]] <- dataset_aggregated %>%
group_by(articlestudy, domain) %>%
slice_sample(n = 1) %>%
ungroup() %>%
select(articlestudy, domain, r_adj, se, var_adj, all_of(mi_vars)) %>%
mutate(across(c(where(is.character), -articlestudy), factor))
}
# Impute using best prediction
## Define models
polr_vars <- c("interdep", "complexity", "longevity")
polyreg_vars <- c(
"art_focus", "pub_status", "design", "setting", "ind_sector", "team_function",
"stud_sample", "tmt", "domain", "meas_type", "rater"
)
special_var <- c("sub_dom")
pmm_vars <- c("power_distance", "collectivism")
for (i in seq_along(datasets)) {
datasets_imp_ahead[[i]] <- datasets[[i]]
for (var in polr_vars) {
if (any(is.na(datasets_imp_ahead[[i]][[var]]))) {
message("\n\n Imputing ", var)
datasets_imp_ahead[[i]][[var]] <- impute_sequentially(datasets_imp_ahead[[i]] %>% select(-articlestudy), var, impute_best_polr)
}
}
for (var in polyreg_vars) {
if (any(is.na(datasets_imp_best[[i]][[var]]))) {
message("\n\n Imputing ", var)
datasets_imp_best[[i]][[var]] <- impute_sequentially(datasets_imp_best[[i]] %>% select(-articlestudy), var, impute_best_polyreg)
}
}
# Impute sub-domains by domain, to ensure valid values
temp_split <- split(datasets_imp_best[[i]], datasets_imp_best[[i]]$domain)
for (domain in names(temp_split)) {
temp_split[[domain]]$sub_dom <- impute_sequentially(temp_split[[domain]] %>% select(-articlestudy), "sub_dom", impute_best_polyreg)
}
datasets_imp_best[[i]] <- bind_rows(temp_split)
# For continuous variables, use PMM function in mice
imp <- mice(datasets_imp_best[[i]], maxit = 0)
methods <- imp$method
methods[] <- ""
methods[pmm_vars] <- "pmm"
preds <- imp$predictorMatrix
preds[] <- 0
preds[pmm_vars, mi_vars] <- 1
preds[pmm_vars, "r_adj"] <- 1
preds[pmm_vars, "se"] <- 1
preds[pmm_vars, pmm_vars] <- 0 # Always missing together, so this would not help
imp <- mice(datasets_imp_best[[i]],
m = 1, seed = 2124 + i, predictorMatrix = preds,
method = methods, donors = 1
)
datasets_imp_best[[i]] <- datasets_imp_best[[i]] %>%
select(-all_of(pmm_vars)) %>%
left_join(imp %>% complete() %>% select(articlestudy, domain, all_of(pmm_vars)))
}
# Impute using random prediction: each missing value is replaced by a random
# draw, so that repeated runs reflect imputation uncertainty.
for (idx in seq_along(datasets)) {
  current <- datasets[[idx]]
  # Impute all moderators except sub_dom, then re-attach the original sub_dom
  imputed <- current %>%
    select(-sub_dom) %>%
    impute_with_random_draw() %>%
    left_join(current %>% select(articlestudy, domain, sub_dom))
  # Impute sub-domains by domain, to ensure valid values
  by_domain <- split(imputed, imputed$domain)
  by_domain <- lapply(by_domain, impute_with_random_draw)
  datasets_imp_rand[[idx]] <- bind_rows(by_domain)
}
# Cache the imputed datasets so imputation is only run once
datasets_imp_best %>% qs::qsave("data/cart_imputed_best.qs")
datasets_imp_rand %>% qs::qsave("data/cart_imputed_random.qs")
} else {
  # Imputation already done: load the cached datasets instead
  datasets_imp_rand <- qs::qread("data/cart_imputed_random.qs")
  datasets_imp_best <- qs::qread("data/cart_imputed_best.qs")
  # Sanity check: the cache must match the requested number of datasets
  # (fixed typo "macth" and replaced `!length(x) == n` with `!=` for clarity)
  if (length(datasets_imp_best) != n_datasets || length(datasets_imp_rand) != n_datasets) {
    warning("Number of loaded datasets does not match `n_datasets`. Consider re-running imputation.")
  }
}
}
run_meta_cart <- function(data, reps, seed = NULL, method = c("standard", "ahead")) {
  # Fit `reps` metaCART trees (metacart::REmrt) on `data` and return them as a list.
  #
  # data:   data frame with effect sizes (r_adj), variances (var_adj) and moderators
  # reps:   number of trees to fit (trees vary through cross-validation randomness)
  # seed:   optional seed for reproducibility
  # method: "standard" (greedy splits) or "ahead" (lookahead algorithm, better
  #         suited to continuous moderators); only the first element is used
  #
  # Returns a list of length `reps`; failed fits (rare crashes in metacart's
  # underlying C code) leave NULL entries, which downstream code drops with
  # discard(is.null). The two near-identical per-method branches of the
  # original are merged, differing only in the `lookahead` flag.
  method <- method[1]
  if (!method %in% c("standard", "ahead")) {
    stop("Invalid method specified")
  }
  if (!is.null(seed)) {
    set.seed(seed)
  }
  source("helpers/metacart_fix.R") # So that it is available when running in parallel
  use_lookahead <- method == "ahead"
  mods <- vector("list", reps) # preallocate instead of growing the list
  cli::cli_progress_bar(name = "Fitting trees", total = reps)
  for (i in seq_len(reps)) {
    fit <- withCallingHandlers(
      try(REmrt(r_adj ~ domain + year_merged + complexity + interdep + collectivism + meas_type + rater + design + art_focus + longevity,
        data,
        vi = var_adj,
        c = 0,
        maxL = 20,
        xval = 10,
        lookahead = use_lookahead
      ), silent = TRUE),
      # Suppress the expected warning when no moderator effect is detected
      warning = function(w) {
        if (endsWith(conditionMessage(w), "no moderator effect was detected")) {
          invokeRestart("muffleWarning")
        }
      }
    )
    if (inherits(fit, "try-error")) {
      # Bug fix: cli_alert_warning() interpolates `{}` via glue; the original
      # passed `i` as a second argument, which never appeared in the message.
      cli::cli_alert_warning("Failed to fit model {i}")
    } else {
      mods[[i]] <- fit
    }
    cli::cli_progress_update()
  }
  cli::cli_progress_done()
  mods
}
if (!(file.exists("data/metacart_trees_ahead.qs")) || !(file.exists("data/metacart_trees_standard.qs"))) { # Do not run if estimation has already been done
  # Build one dataset per (imputation method x domain subset); the "Overall"
  # entry in each split holds the full (unsplit) dataset.
  datasets_imp_best_list <- list()
  datasets_imp_rand_list <- list()
  datasets_imp_best <- datasets_imp_best %>% set_names(as.character(seq_along(.)))
  datasets_imp_rand <- datasets_imp_rand %>% set_names(as.character(seq_along(.)))
  for (i in seq_along(datasets_imp_best)) {
    datasets_imp_best_list[[i]] <- split(datasets_imp_best[[i]], datasets_imp_best[[i]]$domain)
    datasets_imp_best_list[[i]][["Overall"]] <- datasets_imp_best[[i]]
  }
  datasets_imp_best_list <- list_transpose(datasets_imp_best_list)
  # Consistency fix: iterate over the rand datasets here (the original looped
  # over `datasets_imp_best`; both have length n_datasets, so behaviour is unchanged)
  for (i in seq_along(datasets_imp_rand)) {
    datasets_imp_rand_list[[i]] <- split(datasets_imp_rand[[i]], datasets_imp_rand[[i]]$domain)
    datasets_imp_rand_list[[i]][["Overall"]] <- datasets_imp_rand[[i]]
  }
  datasets_imp_rand_list <- list_transpose(datasets_imp_rand_list)
  datasets_imp <- list(best = datasets_imp_best_list, rand = datasets_imp_rand_list)
  datasets_imp <- datasets_imp %>%
    list_flatten() %>%
    list_flatten()
  # Process small datasets first so the parallel workers stay balanced
  # (vapply instead of sapply for a type-stable result)
  datasets_imp <- datasets_imp[order(vapply(datasets_imp, nrow, integer(1)))]
  cores <- future::availableCores()
  plan(multisession, workers = cores)
}
if (!(file.exists("data/metacart_trees_standard.qs"))) {
  # Fit the standard metaCART trees in parallel, one job per dataset
  trees_standard <- future_imap(datasets_imp,
    ~{
      # Extract the index from names to use as seed offset
      index <- which(names(datasets_imp) == .y)
      # Calculate a unique seed for each run
      specific_seed <- 12345 + index
      # Run the function with the specific seed
      result <- run_meta_cart(.x, reps = n_trees, seed = specific_seed, method = "standard")
      # Save the intermediate result using the name of the dataset.
      # Bug fix: use qs::qsave - qs is never attached (only used via ::),
      # so the bare qsave() call would fail, especially in parallel workers.
      qs::qsave(result, paste0(.y, "_standard.qs"))
      # Return result if needed
      result
    },
    .progress = TRUE, .options = furrr_options(seed = TRUE))
  qs::qsave(trees_standard, "data/metacart_trees_standard.qs")
} else {
  # Consistency fix: assign to `trees_standard` (not `trees`) so both branches
  # define the same object; `trees` itself is rebuilt later from both sets.
  trees_standard <- qs::qread("data/metacart_trees_standard.qs")
}
if (!(file.exists("data/metacart_trees_ahead.qs"))) {
  # Fit the lookahead metaCART trees in parallel, one job per dataset
  trees_ahead <- future_imap(datasets_imp,
    ~{
      # Extract the index from names to use as seed offset
      index <- which(names(datasets_imp) == .y)
      # Calculate a unique seed for each run
      specific_seed <- 12345 + index
      # Run the function with the specific seed
      result <- run_meta_cart(.x, reps = n_trees, seed = specific_seed, method = "ahead")
      # Save the intermediate result using the name of the dataset.
      # Bug fix: use qs::qsave - qs is never attached (only used via ::),
      # so the bare qsave() call would fail, especially in parallel workers.
      qs::qsave(result, paste0(.y, "_ahead.qs"))
      # Return result if needed
      result
    },
    .progress = TRUE, .options = furrr_options(seed = TRUE))
  qs::qsave(trees_ahead, "data/metacart_trees_ahead.qs")
} else {
  trees_ahead <- qs::qread("data/metacart_trees_ahead.qs")
}
# Reload both tree sets from the cache, tag their names with the CART method,
# and combine them into a single list for the summaries below.
trees_standard <- qs::qread("data/metacart_trees_standard.qs") %>%
  set_names(paste0(names(.), "_standard"))
trees_ahead <- qs::qread("data/metacart_trees_ahead.qs") %>%
  set_names(paste0(names(.), "_ahead"))
trees <- c(trees_standard, trees_ahead)
get_mode <- function(x) {
  # Return the modal value(s) of `x` as a single string; ties are joined
  # with " | ". table() drops NAs, so they never count as a mode.
  # Robustness fix: for empty (or all-NA) input, return NA_character_
  # instead of "" plus a warning from max() on an empty table.
  tbl <- table(x)
  if (length(tbl) == 0) {
    return(NA_character_)
  }
  mode_val <- names(tbl)[tbl == max(tbl)]
  paste(mode_val, collapse = " | ")
}
# For every dataset, count the leaves of each successfully fitted tree and
# record the modal leaf count plus the full vector of counts.
leaf_counts <- trees %>%
  imap_dfr(function(mods, dataset) {
    leaf_ns <- mods %>%
      discard(is.null) %>%
      map_int(function(mod) length(mod$n))
    tibble(mode = get_mode(leaf_ns), ns = list(leaf_ns), dataset = dataset)
  }) %>%
  separate(dataset, into = c("imp_method", "domain", "run", "cart_method"), sep = "_")
# Summarise modal leaf counts across runs and display as an APA-style table
leaf_counts %>%
  group_by(imp_method, domain, cart_method) %>%
  summarise(mode = get_mode(unlist(ns)),
            .groups = "drop") %>%
  arrange(desc(mode)) %>%
  gt() %>%
  tab_header("Modal number of leaves in metaCART trees (based on 500 runs per row)") %>%
  tab_source_note("Note: 'standard' method uses default method, 'ahead' method uses the lookahead algorithm better suited to continuous moderators.") %>%
  gt_apa_style()
| Modal number of leaves in metaCART trees (based on 500 runs per row) | |||
| imp_method | domain | cart_method | mode |
|---|---|---|---|
| best | Overall | ahead | 3 |
| rand | Job-related | ahead | 3 |
| rand | Overall | ahead | 3 |
| best | Cognitive | ahead | 1 |
| best | Cognitive | standard | 1 |
| best | Demographic | ahead | 1 |
| best | Demographic | standard | 1 |
| best | Job-related | ahead | 1 |
| best | Job-related | standard | 1 |
| best | Overall | standard | 1 |
| rand | Cognitive | ahead | 1 |
| rand | Cognitive | standard | 1 |
| rand | Demographic | ahead | 1 |
| rand | Demographic | standard | 1 |
| rand | Job-related | standard | 1 |
| rand | Overall | standard | 1 |
| Note: 'standard' method uses default method, 'ahead' method uses the lookahead algorithm better suited to continuous moderators. | |||
# Examine the 3-leaf lookahead trees from the 'best' imputation of the full dataset
best_Overall <- trees[which(str_detect(names(trees), "best_Overall.*_ahead"))] %>% list_flatten()
which_3_leaves <- which(sapply(best_Overall, \(x) length(x$n) == 3))
moderators <- best_Overall[which_3_leaves] %>% map("moderators") %>% map_chr(~paste(collapse = " | ", .))
# Bug fix: table() is sorted alphabetically, so `[1]` only yields the most
# frequent combination by coincidence; use which.max() to pick the true maximum.
mod_tbl <- table(moderators)
mod_tbl[which.max(mod_tbl)] %>% {glue::glue("Most frequent moderator combination for 'best' imputation of full dataset: {names(.)} occurred {.} times (out of 500 runs).")}
## Most frequent moderator combination for 'best' imputation of full dataset: collectivism occurred 255 times (out of 500 runs).
which_collectivism <- which(moderators == "collectivism")
# Keep one collectivism-split tree per imputed dataset for plotting
mods_best <- best_Overall[which_3_leaves][which_collectivism] %>%
  enframe(name = "names", value = "data") %>%
  separate(names, into = c(NA, NA, "dataset", NA, NA), sep = "_") %>%
  group_by(dataset) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  pull(data)
# Examine the 3-leaf lookahead trees from the 'random' imputation of the full dataset
rand_Overall <- trees[which(str_detect(names(trees), "rand_Overall.*_ahead"))] %>% list_flatten()
which_3_leaves <- which(sapply(rand_Overall, \(x) length(x$n) == 3))
moderators <- rand_Overall[which_3_leaves] %>% map("moderators") %>% map_chr(~paste(collapse = " | ", .))
# Bug fix: table() is sorted alphabetically, so `[1]` only yields the most
# frequent combination by coincidence; use which.max() to pick the true maximum.
mod_tbl <- table(moderators)
mod_tbl[which.max(mod_tbl)] %>% {glue::glue("Most frequent moderator combination for 'random' imputation of full dataset: {names(.)} occurred {.} times (out of 500 runs).")}
## Most frequent moderator combination for 'random' imputation of full dataset: collectivism occurred 236 times (out of 500 runs).
which_collectivism <- which(moderators == "collectivism")
# Keep one collectivism-split tree per imputed dataset for plotting
mods_rand <- rand_Overall[which_3_leaves][which_collectivism] %>%
  enframe(name = "names", value = "data") %>%
  separate(names, into = c(NA, NA, "dataset", NA, NA), sep = "_") %>%
  group_by(dataset) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  pull(data)
# Plot the selected trees for both imputation approaches, side by side
ps_best <- lapply(mods_best, metacart_plot)
ps_rand <- lapply(mods_rand, metacart_plot)
wrap_plots(ps_best) + plot_annotation(title = "metaCART results for full dataset", subtitle = "... 'best' imputation")
wrap_plots(ps_rand) + plot_annotation(subtitle = "... 'random' imputation")
Overall, these models reliably pick out a small cluster of countries
(Turkey, the United Arab Emirates and Kuwait, though 5/7 studies with
observed collectivism in the 62-63 range are from Turkey). These studies
obviously differ from others on other moderators as well - the
following table shows their values on all moderators, so that the reader
can decide how to interpret the finding that they show a substantially
greater correlation.
# Display all moderator values for the highlighted high-collectivism studies
# (one row per study, scrollable table without search/paging controls)
dataset %>%
  filter(collectivism > 61 & collectivism < 64) %>%
  group_by(id) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  select(author_year, country, all_of(c(cat_moderators, cont_moderators, expl_mods)), -country_grp) %>%
  DT::datatable(options = list(scrollX = TRUE, lengthChange = FALSE, searching = FALSE, paging = FALSE))
# Examine the 3-leaf lookahead trees from the 'random' imputation of the job-related subset
rand_Job <- trees[which(str_detect(names(trees), "rand_Job.*_ahead"))] %>% list_flatten()
which_3_leaves <- which(sapply(rand_Job, \(x) length(x$n) == 3))
moderators <- rand_Job[which_3_leaves] %>% map("moderators") %>% map_chr(~paste(collapse = " | ", .))
# Bug fix: table() is sorted alphabetically, so `[1]` only yields the most
# frequent combination by coincidence; use which.max() to pick the true maximum.
mod_tbl <- table(moderators)
mod_tbl[which.max(mod_tbl)] %>% {glue::glue("Most frequent moderator combination for 'rand' imputation of job-related dataset: {names(.)} occurred {.} times (out of 500 runs).")}
## Most frequent moderator combination for 'rand' imputation of job-related dataset: collectivism occurred 186 times (out of 500 runs).
which_collectivism <- which(moderators == "collectivism")
# Keep one collectivism-split tree per imputed dataset for plotting
mods_rand <- rand_Job[which_3_leaves][which_collectivism] %>%
  enframe(name = "names", value = "data") %>%
  separate(names, into = c(NA, NA, "dataset", NA, NA), sep = "_") %>%
  group_by(dataset) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  pull(data)
# Examine the 3-leaf lookahead trees from the 'best' imputation of the job-related subset
best_Job <- trees[which(str_detect(names(trees), "best_Job.*_ahead"))] %>% list_flatten()
which_3_leaves <- which(sapply(best_Job, \(x) length(x$n) == 3))
moderators <- best_Job[which_3_leaves] %>% map("moderators") %>% map_chr(~paste(collapse = " | ", .))
# Bug fixes: (1) table() is sorted alphabetically, so `[1]` only yields the
# most frequent combination by coincidence - use which.max(); (2) the message
# wrongly said 'rand' here even though this block covers the 'best' imputation.
mod_tbl <- table(moderators)
mod_tbl[which.max(mod_tbl)] %>% {glue::glue("Most frequent moderator combination for 'best' imputation of job-related dataset: {names(.)} occurred {.} times (out of 500 runs).")}
## Most frequent moderator combination for 'rand' imputation of job-related dataset: collectivism occurred 80 times (out of 500 runs).
which_collectivism <- which(moderators == "collectivism")
# Keep one collectivism-split tree per imputed dataset for plotting
mods_best <- best_Job[which_3_leaves][which_collectivism] %>%
  enframe(name = "names", value = "data") %>%
  separate(names, into = c(NA, NA, "dataset", NA, NA), sep = "_") %>%
  group_by(dataset) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  pull(data)
# Plot the selected job-related trees for both imputation approaches
ps_rand <- lapply(mods_rand, metacart_plot)
ps_best <- lapply(mods_best, metacart_plot)
wrap_plots(ps_rand) + plot_annotation(title = "metaCART results for Job-related dataset", subtitle = "... 'random' imputation")
wrap_plots(ps_best) + plot_annotation(subtitle = "... 'best' imputation", caption = "NB: Modal number of leaves was 1 for this dataset - this is only to consider consistency with 'random' imputation.")